radeonsi: gather pipe_stream_output_info from NIR intrinsics

This stops gathering pipe_stream_output_info in the create_*s_state context
functions because NIR contains everything and can do more advanced shader
linking this way.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14414>
This commit is contained in:
Marek Olšák 2021-12-19 20:10:03 -05:00 committed by Marge Bot
parent 981bd8cbe2
commit b57a163b7d
8 changed files with 41 additions and 35 deletions

View file

@ -606,7 +606,7 @@ static unsigned ngg_nogs_vertex_size(struct si_shader *shader)
/* The edgeflag is always stored in the last element that's also
* used for padding to reduce LDS bank conflicts. */
if (shader->selector->so.num_outputs)
if (shader->selector->info.enabled_streamout_buffer_mask)
lds_vertex_size = 4 * shader->selector->info.num_outputs + 1;
if (gfx10_ngg_writes_user_edgeflags(shader))
lds_vertex_size = MAX2(lds_vertex_size, 1);
@ -2169,7 +2169,7 @@ unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader)
{
const struct si_shader_selector *sel = shader->selector;
if (sel->info.stage == MESA_SHADER_GEOMETRY && sel->so.num_outputs)
if (sel->info.stage == MESA_SHADER_GEOMETRY && sel->info.enabled_streamout_buffer_mask)
return 44;
return 8;

View file

@ -814,7 +814,7 @@ struct si_streamout {
/* External state which comes from the vertex shader,
* it must be set explicitly when binding a shader. */
uint16_t *stride_in_dw;
uint8_t *stride_in_dw;
unsigned enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */
/* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */

View file

@ -27,6 +27,7 @@
#include "nir.h"
#include "nir_builder.h"
#include "nir_serialize.h"
#include "nir/nir_helpers.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
@ -1587,7 +1588,9 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
bool free_nir;
struct nir_shader *nir = si_get_nir_shader(sel, &shader->key, &free_nir);
struct pipe_stream_output_info so = sel->so;
struct pipe_stream_output_info so = {};
if (sel->info.enabled_streamout_buffer_mask)
nir_gather_stream_output_info(nir, &so);
/* Dump NIR before doing NIR->LLVM conversion in case the
* conversion fails. */
@ -1616,7 +1619,7 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
/* The GS copy shader is compiled next. */
if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
shader->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug);
shader->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, &so, debug);
if (!shader->gs_copy_shader) {
fprintf(stderr, "radeonsi: can't create GS copy shader\n");
return false;
@ -2312,7 +2315,7 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler
shader->uses_vs_state_outprim = sscreen->use_ngg &&
/* Only used by streamout in vertex shaders. */
sel->info.stage == MESA_SHADER_VERTEX &&
sel->so.num_outputs;
sel->info.enabled_streamout_buffer_mask;
if (sel->info.stage == MESA_SHADER_VERTEX) {
shader->uses_base_instance = sel->info.uses_base_instance ||

View file

@ -367,6 +367,7 @@ struct si_shader_info {
int constbuf0_num_slots;
ubyte num_stream_output_components[4];
uint16_t enabled_streamout_buffer_mask;
uint num_memory_stores;
@ -459,7 +460,6 @@ struct si_shader_selector {
void *nir_binary;
unsigned nir_size;
struct pipe_stream_output_info so;
struct si_shader_info info;
enum pipe_shader_type pipe_shader_type;
@ -486,7 +486,6 @@ struct si_shader_selector {
uint16_t gsvs_vertex_size;
ubyte gs_input_verts_per_prim;
unsigned max_gsvs_emit_size;
uint16_t enabled_streamout_buffer_mask;
bool tess_turns_off_ngg;
/* PS parameters. */
@ -959,6 +958,7 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf
struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
struct ac_llvm_compiler *compiler,
struct si_shader_selector *gs_selector,
const struct pipe_stream_output_info *so,
struct util_debug_callback *debug);
/* si_shader_nir.c */

View file

@ -325,6 +325,7 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr
(nir_intrinsic_component(intr) * 2);
unsigned new_mask = mask & ~info->output_usagemask[loc];
/* Iterate over all components. */
for (unsigned i = 0; i < 4; i++) {
unsigned stream = (gs_streams >> (i * 2)) & 0x3;
@ -332,6 +333,16 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr
info->output_streams[loc] |= stream << (i * 2);
info->num_stream_output_components[stream]++;
}
if (nir_intrinsic_has_io_xfb(intr)) {
nir_io_xfb xfb = i < 2 ? nir_intrinsic_io_xfb(intr) :
nir_intrinsic_io_xfb2(intr);
if (xfb.out[i % 2].num_components) {
unsigned stream = (gs_streams >> (i * 2)) & 0x3;
info->enabled_streamout_buffer_mask |=
BITFIELD_BIT(stream * 4 + xfb.out[i % 2].buffer);
}
}
}
if (nir_intrinsic_has_src_type(intr))

View file

@ -422,6 +422,7 @@ void si_preload_gs_rings(struct si_shader_context *ctx)
struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
struct ac_llvm_compiler *compiler,
struct si_shader_selector *gs_selector,
const struct pipe_stream_output_info *so,
struct util_debug_callback *debug)
{
struct si_shader_context ctx;
@ -446,7 +447,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
si_llvm_context_init(&ctx, sscreen, compiler, shader->wave_size);
ctx.shader = shader;
ctx.stage = MESA_SHADER_VERTEX;
ctx.so = gs_selector->so;
ctx.so = *so;
builder = ctx.ac.builder;

View file

@ -203,9 +203,6 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
_mesa_sha1_init(&ctx);
_mesa_sha1_update(&ctx, &shader_variant_flags, 4);
_mesa_sha1_update(&ctx, ir_binary, ir_size);
if (sel->info.stage == MESA_SHADER_VERTEX || sel->info.stage == MESA_SHADER_TESS_EVAL ||
sel->info.stage == MESA_SHADER_GEOMETRY)
_mesa_sha1_update(&ctx, &sel->so, sizeof(sel->so));
_mesa_sha1_final(&ctx, ir_sha1_cache_key);
if (ir_binary == blob.data)
@ -1512,7 +1509,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
}
shader->ctx_reg.ngg.vgt_stages.u.ngg = 1;
shader->ctx_reg.ngg.vgt_stages.u.streamout = gs_sel->so.num_outputs;
shader->ctx_reg.ngg.vgt_stages.u.streamout = !!gs_sel->info.enabled_streamout_buffer_mask;
shader->ctx_reg.ngg.vgt_stages.u.ngg_passthrough = gfx10_is_ngg_passthrough(shader);
shader->ctx_reg.ngg.vgt_stages.u.gs_wave32 = shader->wave_size == 32;
}
@ -1702,11 +1699,11 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
rsrc1 |= S_00B128_SGPRS((shader->config.num_sgprs - 1) / 8);
if (!sscreen->use_ngg_streamout) {
rsrc2 |= S_00B12C_SO_BASE0_EN(!!shader->selector->so.stride[0]) |
S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) |
S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
S_00B12C_SO_EN(!!shader->selector->so.num_outputs);
rsrc2 |= S_00B12C_SO_BASE0_EN(!!shader->selector->info.base.xfb_stride[0]) |
S_00B12C_SO_BASE1_EN(!!shader->selector->info.base.xfb_stride[1]) |
S_00B12C_SO_BASE2_EN(!!shader->selector->info.base.xfb_stride[2]) |
S_00B12C_SO_BASE3_EN(!!shader->selector->info.base.xfb_stride[3]) |
S_00B12C_SO_EN(!!info->enabled_streamout_buffer_mask);
}
si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS, rsrc1);
@ -2783,7 +2780,7 @@ int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state
}
}
static void si_parse_next_shader_property(const struct si_shader_info *info, bool streamout,
static void si_parse_next_shader_property(const struct si_shader_info *info,
union si_shader_key *key)
{
gl_shader_stage next_shader = info->base.next_stage;
@ -2804,7 +2801,7 @@ static void si_parse_next_shader_property(const struct si_shader_info *info, boo
* assume that it's a HW LS. (the next shader is TCS)
* This heuristic is needed for separate shader objects.
*/
if (!info->writes_position && !streamout)
if (!info->writes_position && !info->enabled_streamout_buffer_mask)
key->ge.as_ls = 1;
}
break;
@ -2874,10 +2871,11 @@ static void si_init_shader_selector_async(void *job, void *gdata, int thread_ind
shader->selector = sel;
shader->is_monolithic = false;
si_parse_next_shader_property(&sel->info, sel->so.num_outputs != 0, &shader->key);
si_parse_next_shader_property(&sel->info, &shader->key);
if (sel->info.stage <= MESA_SHADER_GEOMETRY &&
sscreen->use_ngg && (!sel->so.num_outputs || sscreen->use_ngg_streamout) &&
sscreen->use_ngg && (!sel->info.enabled_streamout_buffer_mask ||
sscreen->use_ngg_streamout) &&
((sel->info.stage == MESA_SHADER_VERTEX && !shader->key.ge.as_ls) ||
sel->info.stage == MESA_SHADER_TESS_EVAL || sel->info.stage == MESA_SHADER_GEOMETRY))
shader->key.ge.as_ngg = 1;
@ -3035,8 +3033,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
sel->compiler_ctx_state.debug = sctx->debug;
sel->compiler_ctx_state.is_debug_context = sctx->is_debug;
sel->so = state->stream_output;
if (state->type == PIPE_SHADER_IR_TGSI) {
sel->nir = tgsi_to_nir(state->tokens, ctx->screen, true);
} else {
@ -3057,12 +3053,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
si_get_active_slot_masks(&sel->info, &sel->active_const_and_shader_buffers,
&sel->active_samplers_and_images);
/* Record which streamout buffers are enabled. */
for (unsigned i = 0; i < sel->so.num_outputs; i++) {
sel->enabled_streamout_buffer_mask |= (1 << sel->so.output[i].output_buffer)
<< (sel->so.output[i].stream * 4);
}
sel->num_vs_inputs =
sel->info.stage == MESA_SHADER_VERTEX && !sel->info.base.vs.blit_sgprs_amd
? sel->info.num_inputs
@ -3197,7 +3187,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
!sel->info.writes_viewport_index && /* cull only against viewport 0 */
!sel->info.base.writes_memory &&
/* NGG GS supports culling with streamout because it culls after streamout. */
(sel->info.stage == MESA_SHADER_GEOMETRY || !sel->so.num_outputs) &&
(sel->info.stage == MESA_SHADER_GEOMETRY || !sel->info.enabled_streamout_buffer_mask) &&
(sel->info.stage != MESA_SHADER_GEOMETRY || sel->info.num_stream_output_components[0]) &&
(sel->info.stage != MESA_SHADER_VERTEX ||
(!sel->info.base.vs.blit_sgprs_amd &&
@ -3312,8 +3302,8 @@ static void si_update_streamout_state(struct si_context *sctx)
if (!shader_with_so)
return;
sctx->streamout.enabled_stream_buffers_mask = shader_with_so->enabled_streamout_buffer_mask;
sctx->streamout.stride_in_dw = shader_with_so->so.stride;
sctx->streamout.enabled_stream_buffers_mask = shader_with_so->info.enabled_streamout_buffer_mask;
sctx->streamout.stride_in_dw = shader_with_so->info.base.xfb_stride;
}
static void si_update_clip_regs(struct si_context *sctx, struct si_shader_selector *old_hw_vs,
@ -3440,7 +3430,8 @@ bool si_update_ngg(struct si_context *sctx)
} else if (!sctx->screen->use_ngg_streamout) {
struct si_shader_selector *last = si_get_vs(sctx)->cso;
if ((last && last->so.num_outputs) || sctx->streamout.prims_gen_query_enabled)
if ((last && last->info.enabled_streamout_buffer_mask) ||
sctx->streamout.prims_gen_query_enabled)
new_ngg = false;
}

View file

@ -308,7 +308,7 @@ static void si_emit_streamout_begin(struct si_context *sctx)
{
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
struct si_streamout_target **t = sctx->streamout.targets;
uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
uint8_t *stride_in_dw = sctx->streamout.stride_in_dw;
unsigned i;
si_flush_vgt_streamout(sctx);