radeonsi: clean up si_set_streamout_targets

Merge the branches and reorder code. Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26095>
2026-05-08 02:38:04 +02:00 · 2023-10-16 14:19:48 -04:00 · 2023-10-16 14:19:48 -04:00 · 0c773325bb
commit 0c773325bb
parent e7c8d32bfc
1 changed files with 50 additions and 72 deletions
--- a/src/gallium/drivers/radeonsi/si_state_streamout.c
+++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
@ -63,10 +63,12 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
   struct si_context *sctx = (struct si_context *)ctx;
   unsigned old_num_targets = sctx->streamout.num_targets;
   unsigned i;
-   bool wait_now = false;

   /* We are going to unbind the buffers. Mark which caches need to be flushed. */
-   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
+   if (old_num_targets && sctx->streamout.begin_emitted) {
+      /* Stop streamout. */
+      si_emit_streamout_end(sctx);
+
      /* Since streamout uses vector writes which go through TC L2
       * and most other clients can use TC L2 as well, we don't need
       * to flush it.
@ -76,7 +78,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
       * cases. Thus, flag the TC L2 dirtiness in the resource and
       * handle it at draw call time.
       */
-      for (i = 0; i < sctx->streamout.num_targets; i++)
+      for (i = 0; i < old_num_targets; i++)
         if (sctx->streamout.targets[i])
            si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;

@ -90,53 +92,33 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
       * VS_PARTIAL_FLUSH is required if the buffers are going to be
       * used as an input immediately.
       */
-      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
-
-      if (sctx->gfx_level >= GFX11) {
-         sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
-
-         /* Wait now. This is needed to make sure that GDS is not
-          * busy at the end of IBs.
-          *
-          * Also, the next streamout operation will overwrite GDS,
-          * so we need to make sure that it's idle.
-          */
-         wait_now = true;
-      } else {
-         sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
-      }
-   }
-
-   /* All readers of the streamout targets need to be finished before we can
-    * start writing to the targets.
-    */
-   if (num_targets) {
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
-                     SI_CONTEXT_PFP_SYNC_ME;
-   }
-
-   if (sctx->flags)
+      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
+                     SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
+   }
+
+   /* TODO: This is a hack that fixes these failures. It shouldn't be necessary.
+    *    spec@ext_transform_feedback@immediate-reuse
+    *    spec@ext_transform_feedback@immediate-reuse-index-buffer
+    *    spec@ext_transform_feedback@immediate-reuse-uniform-buffer
+    *    .. and some dEQP-GLES[23].functional.fragment_ops.random.*
+    */
+   if (sctx->gfx_level >= GFX11)
+      si_flush_gfx_cs(sctx, 0, NULL);

   /* Streamout buffers must be bound in 2 places:
    * 1) in VGT by setting the VGT_STRMOUT registers
    * 2) as shader resources
    */
-
-   /* Stop streamout. */
-   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
-      si_emit_streamout_end(sctx);
-
-   /* TODO: This is a hack that fixes streamout failures. It shouldn't be necessary. */
-   if (sctx->gfx_level >= GFX11 && !wait_now)
-      si_flush_gfx_cs(sctx, 0, NULL);
-
-   /* Set the new targets. */
   unsigned enabled_mask = 0, append_bitmask = 0;
+
   for (i = 0; i < num_targets; i++) {
      si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
-      if (!targets[i])
+
+      if (!targets[i]) {
+         si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
         continue;
+      }

      enabled_mask |= 1 << i;

@ -151,52 +133,48 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
                              &t->buf_filled_size_offset,
                              (struct pipe_resource **)&t->buf_filled_size);
      }
-   }

-   for (; i < sctx->streamout.num_targets; i++)
+      /* Bind it to the shader. */
+      struct pipe_shader_buffer sbuf;
+      sbuf.buffer = targets[i]->buffer;
+
+      if (sctx->gfx_level >= GFX11) {
+         sbuf.buffer_offset = targets[i]->buffer_offset;
+         sbuf.buffer_size = targets[i]->buffer_size;
+      } else {
+         sbuf.buffer_offset = 0;
+         sbuf.buffer_size = targets[i]->buffer_offset + targets[i]->buffer_size;
+      }
+
+      si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
+      si_resource(targets[i]->buffer)->bind_history |= SI_BIND_STREAMOUT_BUFFER;
+   }
+   for (; i < old_num_targets; i++) {
      si_so_target_reference(&sctx->streamout.targets[i], NULL);
-
-   if (!!sctx->streamout.enabled_mask != !!enabled_mask) {
-      sctx->streamout.enabled_mask = enabled_mask;
-      sctx->do_update_shaders = true; /* to keep/remove streamout shader code as an optimization */
+      si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
   }

+   if (!!sctx->streamout.enabled_mask != !!enabled_mask)
+      sctx->do_update_shaders = true; /* to keep/remove streamout shader code as an optimization */
+
   sctx->streamout.num_targets = num_targets;
+   sctx->streamout.enabled_mask = enabled_mask;
   sctx->streamout.append_bitmask = append_bitmask;

   /* Update dirty state bits. */
   if (num_targets) {
      si_streamout_buffers_dirty(sctx);
+
+      /* All readers of the streamout targets need to be finished before we can
+       * start writing to them.
+       */
+      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+                     SI_CONTEXT_PFP_SYNC_ME;
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
   } else {
      si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
      si_set_streamout_enable(sctx, false);
   }
-
-   /* Set the shader resources.*/
-   for (i = 0; i < num_targets; i++) {
-      if (targets[i]) {
-         struct pipe_shader_buffer sbuf;
-         sbuf.buffer = targets[i]->buffer;
-
-         if (sctx->gfx_level >= GFX11) {
-            sbuf.buffer_offset = targets[i]->buffer_offset;
-            sbuf.buffer_size = targets[i]->buffer_size;
-         } else {
-            sbuf.buffer_offset = 0;
-            sbuf.buffer_size = targets[i]->buffer_offset + targets[i]->buffer_size;
-         }
-
-         si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
-         si_resource(targets[i]->buffer)->bind_history |= SI_BIND_STREAMOUT_BUFFER;
-      } else {
-         si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
-      }
-   }
-   for (; i < old_num_targets; i++)
-      si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
-
-   if (wait_now)
-      si_emit_cache_flush_direct(sctx);
 }

 static void si_flush_vgt_streamout(struct si_context *sctx)