radeonsi: mostly fix NGG streamout overflow queries when XFB is disabled

When XFB was disabled, we incremented primitives_generated but not primitives_emitted, so the overflow query returned true even though it should have returned false because XFB was disabled. This commit stops counting primitives_generated when XFB is disabled and no primitives_generated query is active. When a primitives_generated query and the overflow query are active simultaneously and XFB is disabled, the overflow result is incorrect again, but the non-NGG codepath has always been incorrect in the same way; it just was not discovered because of the lack of tests. This commit only changes NGG streamout queries to behave the same as legacy. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37849>
2026-05-04 22:49:13 +02:00 · 2025-10-12 23:00:04 -04:00 · 2025-10-12 23:00:04 -04:00 · dd4df28ef2
commit dd4df28ef2
parent 02db1fbe82
6 changed files with 66 additions and 37 deletions
--- a/src/gallium/drivers/radeonsi/ci/gfx11-navi31-fail.csv
+++ b/src/gallium/drivers/radeonsi/ci/gfx11-navi31-fail.csv
@ -423,9 +423,6 @@ KHR-GL46.sparse_texture_clamp_tests.SparseTextureClampLookupColor_texture_1d_rgb
 KHR-GL46.sparse_texture_clamp_tests.SparseTextureClampLookupColor_texture_1d_rgba8_snorm,Fail
 KHR-GL46.sparse_texture_clamp_tests.SparseTextureClampLookupColor_texture_1d_rgba8i,Fail
 KHR-GL46.sparse_texture_clamp_tests.SparseTextureClampLookupColor_texture_1d_rgba8ui,Fail
-KHR-GL46.transform_feedback_overflow_query_ARB.advanced-single-stream-interleaved-attribs,Fail
-KHR-GL46.transform_feedback_overflow_query_ARB.advanced-single-stream-separate-attribs,Fail
-KHR-GL46.transform_feedback_overflow_query_ARB.multiple-streams-multiple-buffers-per-stream,Fail
 KHR-GL46.transform_feedback_overflow_query_ARB.multiple-streams-one-buffer-per-stream,Fail
 KHR-GL46.texture_query_lod.sampler1D_test,Fail
 KHR-GL46.texture_query_lod.sampler2D_test,Fail
--- a/src/gallium/drivers/radeonsi/ci/gfx12-gfx1200-fail.csv
+++ b/src/gallium/drivers/radeonsi/ci/gfx12-gfx1200-fail.csv
@ -30,10 +30,6 @@ spec@glsl-es-1.00@linker@glsl-mismatched-uniform-precision-unused,Fail
 ## Fail because GFX10+ removed MS texture support (see si_get_sparse_texture_virtual_page_size)
 KHR-GL46.sparse_texture2_tests.SparseTexture2Allocation,Fail
 KHR-GL46.sparse_texture2_tests.SparseTexture2Commitment,Fail
-## https://gitlab.freedesktop.org/mesa/mesa/-/issues/636
-KHR-GL46.transform_feedback_overflow_query_ARB.advanced-single-stream-interleaved-attribs,Fail
-KHR-GL46.transform_feedback_overflow_query_ARB.advanced-single-stream-separate-attribs,Fail
-KHR-GL46.transform_feedback_overflow_query_ARB.multiple-streams-multiple-buffers-per-stream,Fail
 KHR-GL46.transform_feedback_overflow_query_ARB.multiple-streams-one-buffer-per-stream,Fail

 # See Khronos issue 5587: the test expects one-dimensional (array) texture to work while
--- a/src/gallium/drivers/radeonsi/gfx11_query.c
+++ b/src/gallium/drivers/radeonsi/gfx11_query.c
@ -109,7 +109,6 @@ success:
   sbuf.buffer_offset = qbuf->head;
   sbuf.buffer_size = sizeof(struct gfx11_sh_query_buffer_mem);
   si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, &sbuf);
-   SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 1);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
   return true;
@ -135,8 +134,14 @@ static bool gfx11_sh_query_begin(struct si_context *sctx, struct si_query *rquer
   query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
   query->first_begin = query->first->head;

-   sctx->streamout.num_ngg_queries++;
   query->first->refcount++;
+   si_update_prims_generated_query_state(sctx, query->b.type, 1);
+
+   /* Update num_ngg_queries and dirty streamout_enable if the enable state changed. */
+   bool old_streamout_query_enable_state = si_get_streamout_enable_state(sctx);
+   sctx->streamout.num_ngg_queries++;
+   if (old_streamout_query_enable_state != si_get_streamout_enable_state(sctx))
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);

   return true;
 }
@ -161,11 +166,16 @@ static bool gfx11_sh_query_end(struct si_context *sctx, struct si_query *rquery)
                        0xffffffff, PIPE_QUERY_GPU_FINISHED);
   }

+   si_update_prims_generated_query_state(sctx, query->b.type, -1);
+
+   /* Update num_ngg_queries and dirty streamout_enable if the enable state changed. */
+   bool old_streamout_query_enable_state = si_get_streamout_enable_state(sctx);
   sctx->streamout.num_ngg_queries--;
+   if (old_streamout_query_enable_state != si_get_streamout_enable_state(sctx))
+      si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);

   if (sctx->streamout.num_ngg_queries <= 0 || !si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query)) {
      si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
-      SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 0);

      /* If a query_begin is followed by a query_end without a draw
       * in-between, we need to clear the atom to ensure that the
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@ -616,8 +616,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
         si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state);
      si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref);
      si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map);
-      if (ctx->gfx_level < GFX11)
-         si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
+      si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
      /* CLEAR_STATE disables all window rectangles. */
      if (!has_clear_state || ctx->num_window_rectangles > 0)
         si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@ -1896,7 +1896,32 @@ static inline struct si_shader_ctx_state *si_get_vs(struct si_context *sctx)

 static inline bool si_get_streamout_enable_state(struct si_context *sctx)
 {
-   return sctx->streamout.streamout_enabled || sctx->streamout.prims_gen_query_enabled;
+   /* For GFX11 and newer, return whether NGG streamout queries are enabled. For older gens,
+    * return whether streamout hw is enabled.
+    *
+    * Note that when both PRIMITIVES_GENERATED and SO_OVERFLOW queries are enabled and XFB is
+    * disabled, SO_OVERFLOW queries will incorrectly return true because PRIMITIVES_GENERATED
+    * is incremented and PRIMITIVES_EMITTED is not. SO_OVERFLOW queries are implemented by
+    * comparing PRIMITIVES_GENERATED and PRIMITIVES_EMITTED. When XFB is disabled, SO_OVERFLOW
+    * queries should increment neither counter, but a separate PRIMITIVES_GENERATED query that
+    * is active at the same time must still increment PRIMITIVES_GENERATED. So the 2 queries
+    * are in conflict when XFB is disabled.
+    *
+    * Possible solutions:
+    * - For NGG: Emulate SO_OVERFLOW queries using memory stores separately from PRIMITIVES_GENERATED.
+    * - For legacy: Emulate SO_OVERFLOW queries using memory stores, same as NGG.
+    */
+   if (sctx->gfx_level >= GFX11) {
+      /* Enable NGG streamout queries when PRIMITIVES_GENERATED queries are active or when
+       * streamout is enabled and any streamout queries except PRIMITIVES_GENERATED are active.
+       */
+      return sctx->streamout.prims_gen_query_enabled ||
+            (sctx->streamout.streamout_enabled &&
+              (sctx->streamout.num_ngg_queries -
+               sctx->streamout.prims_gen_query_enabled > 0));
+   } else {
+      return sctx->streamout.streamout_enabled || sctx->streamout.prims_gen_query_enabled;
+   }
 }

 static inline unsigned si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size)
--- a/src/gallium/drivers/radeonsi/si_state_streamout.c
+++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
@ -419,41 +419,45 @@ void si_emit_streamout_end(struct si_context *sctx)

 static void si_emit_streamout_enable(struct si_context *sctx, unsigned index)
 {
-   assert(sctx->gfx_level < GFX11);
-
-   radeon_begin(&sctx->gfx_cs);
-   radeon_set_context_reg_seq(R_028B94_VGT_STRMOUT_CONFIG, 2);
-   radeon_emit(S_028B94_STREAMOUT_0_EN(si_get_streamout_enable_state(sctx)) |
-               S_028B94_RAST_STREAM(0) |
-               S_028B94_STREAMOUT_1_EN(si_get_streamout_enable_state(sctx)) |
-               S_028B94_STREAMOUT_2_EN(si_get_streamout_enable_state(sctx)) |
-               S_028B94_STREAMOUT_3_EN(si_get_streamout_enable_state(sctx)));
-   radeon_emit(sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask);
-   radeon_end();
+   if (sctx->gfx_level >= GFX11) {
+      SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED,
+                si_get_streamout_enable_state(sctx));
+   } else {
+      radeon_begin(&sctx->gfx_cs);
+      radeon_set_context_reg_seq(R_028B94_VGT_STRMOUT_CONFIG, 2);
+      radeon_emit(S_028B94_STREAMOUT_0_EN(si_get_streamout_enable_state(sctx)) |
+                  S_028B94_RAST_STREAM(0) |
+                  S_028B94_STREAMOUT_1_EN(si_get_streamout_enable_state(sctx)) |
+                  S_028B94_STREAMOUT_2_EN(si_get_streamout_enable_state(sctx)) |
+                  S_028B94_STREAMOUT_3_EN(si_get_streamout_enable_state(sctx)));
+      radeon_emit(sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask);
+      radeon_end();
+   }
 }

 static void si_set_streamout_enable(struct si_context *sctx, bool enable)
 {
-   if (sctx->gfx_level >= GFX11)
-      return;
-
   bool old_strmout_en = si_get_streamout_enable_state(sctx);
   unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;

   sctx->streamout.streamout_enabled = enable;

-   sctx->streamout.hw_enabled_mask =
-      sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) |
-      (sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12);
-
-   if ((old_strmout_en != si_get_streamout_enable_state(sctx)) ||
-       (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))
+   if (old_strmout_en != si_get_streamout_enable_state(sctx))
      si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
+
+   if (sctx->gfx_level < GFX11) {
+      sctx->streamout.hw_enabled_mask =
+         sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) |
+         (sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12);
+
+      if (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)
+         si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
+   }
 }

 void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff)
 {
-   if (sctx->gfx_level < GFX11 && type == PIPE_QUERY_PRIMITIVES_GENERATED) {
+   if (type == PIPE_QUERY_PRIMITIVES_GENERATED) {
      bool old_strmout_en = si_get_streamout_enable_state(sctx);

      sctx->streamout.num_prims_gen_queries += diff;
@ -479,7 +483,5 @@ void si_init_streamout_functions(struct si_context *sctx)
   sctx->b.stream_output_target_destroy = si_so_target_destroy;
   sctx->b.set_stream_output_targets = si_set_streamout_targets;
   sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
-
-   if (sctx->gfx_level < GFX11)
-      sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
+   sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
 }