radeonsi: mostly fix NGG streamout overflow queries when XFB is disabled
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

When XFB was disabled, we were incrementing primitives_generated but not
primitives_emitted, which caused the overflow query to return true, but
it should have returned false because XFB was disabled.

This disables counting primitives_generated when there is no
primitives_generated query. When both the primitives_generated query and the
overflow query are enabled simultaneously and XFB is disabled, the overflow
query result will be incorrect again, but it has been equally incorrect with
the non-NGG codepath; that was just not discovered because of the lack of tests.

This commit just changes NGG streamout queries to behave the same as legacy.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37849>
This commit is contained in:
Marek Olšák 2025-10-12 23:00:04 -04:00 committed by Marge Bot
parent 02db1fbe82
commit dd4df28ef2
6 changed files with 66 additions and 37 deletions

View file

@ -423,9 +423,6 @@ KHR-GL46.sparse_texture_clamp_tests.SparseTextureClampLookupColor_texture_1d_rgb
KHR-GL46.sparse_texture_clamp_tests.SparseTextureClampLookupColor_texture_1d_rgba8_snorm,Fail
KHR-GL46.sparse_texture_clamp_tests.SparseTextureClampLookupColor_texture_1d_rgba8i,Fail
KHR-GL46.sparse_texture_clamp_tests.SparseTextureClampLookupColor_texture_1d_rgba8ui,Fail
KHR-GL46.transform_feedback_overflow_query_ARB.advanced-single-stream-interleaved-attribs,Fail
KHR-GL46.transform_feedback_overflow_query_ARB.advanced-single-stream-separate-attribs,Fail
KHR-GL46.transform_feedback_overflow_query_ARB.multiple-streams-multiple-buffers-per-stream,Fail
KHR-GL46.transform_feedback_overflow_query_ARB.multiple-streams-one-buffer-per-stream,Fail
KHR-GL46.texture_query_lod.sampler1D_test,Fail
KHR-GL46.texture_query_lod.sampler2D_test,Fail

1 # LLVM 20-dev (da439d3af47b)
423 KHR-GL46.transform_feedback_overflow_query_ARB.advanced-single-stream-separate-attribs,Fail KHR-GL46.texture_query_lod.sampler1D_test,Fail
424 KHR-GL46.transform_feedback_overflow_query_ARB.multiple-streams-multiple-buffers-per-stream,Fail KHR-GL46.texture_query_lod.sampler2D_test,Fail
425 KHR-GL46.transform_feedback_overflow_query_ARB.multiple-streams-one-buffer-per-stream,Fail KHR-GL46.texture_query_lod.sampler3D_test,Fail
KHR-GL46.texture_query_lod.sampler1D_test,Fail
KHR-GL46.texture_query_lod.sampler2D_test,Fail
KHR-GL46.texture_query_lod.sampler3D_test,Fail
426 # escts failures (pass with ACO)
427 KHR-GLES3.shaders.uniform_block.random.nested_structs_instance_arrays.0,Fail
428 KHR-GLES31.core.shader_image_load_store.basic-allFormats-store-fs,Fail

View file

@ -30,10 +30,6 @@ spec@glsl-es-1.00@linker@glsl-mismatched-uniform-precision-unused,Fail
## Fail because GFX10+ removed MS texture support (see si_get_sparse_texture_virtual_page_size)
KHR-GL46.sparse_texture2_tests.SparseTexture2Allocation,Fail
KHR-GL46.sparse_texture2_tests.SparseTexture2Commitment,Fail
## https://gitlab.freedesktop.org/mesa/mesa/-/issues/636
KHR-GL46.transform_feedback_overflow_query_ARB.advanced-single-stream-interleaved-attribs,Fail
KHR-GL46.transform_feedback_overflow_query_ARB.advanced-single-stream-separate-attribs,Fail
KHR-GL46.transform_feedback_overflow_query_ARB.multiple-streams-multiple-buffers-per-stream,Fail
KHR-GL46.transform_feedback_overflow_query_ARB.multiple-streams-one-buffer-per-stream,Fail
# See Khronos issue 5587: the test expects one-dimensional (array) texture to work while

1 # piglit failures
30 KHR-GL46.sparse_texture2_tests.SparseTexture2Allocation,Fail
31 KHR-GL46.sparse_texture2_tests.SparseTexture2Commitment,Fail
32 ## https://gitlab.freedesktop.org/mesa/mesa/-/issues/636 KHR-GL46.transform_feedback_overflow_query_ARB.multiple-streams-one-buffer-per-stream,Fail
KHR-GL46.transform_feedback_overflow_query_ARB.advanced-single-stream-interleaved-attribs,Fail
KHR-GL46.transform_feedback_overflow_query_ARB.advanced-single-stream-separate-attribs,Fail
KHR-GL46.transform_feedback_overflow_query_ARB.multiple-streams-multiple-buffers-per-stream,Fail
KHR-GL46.transform_feedback_overflow_query_ARB.multiple-streams-one-buffer-per-stream,Fail
33 # See Khronos issue 5587: the test expects one-dimensional (array) texture to work while
34 # it's explicitly marked as non-supported by EXT_sparse_texture2.
35 KHR-GL46.sparse_texture2_tests.StandardPageSizesTestCase_texture_1d_array_r11f_g11f_b10f,Fail

View file

@ -109,7 +109,6 @@ success:
sbuf.buffer_offset = qbuf->head;
sbuf.buffer_size = sizeof(struct gfx11_sh_query_buffer_mem);
si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, &sbuf);
SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 1);
si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
return true;
@ -135,8 +134,14 @@ static bool gfx11_sh_query_begin(struct si_context *sctx, struct si_query *rquer
query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx11_sh_query_buffer, list);
query->first_begin = query->first->head;
sctx->streamout.num_ngg_queries++;
query->first->refcount++;
si_update_prims_generated_query_state(sctx, query->b.type, 1);
/* Update num_ngg_queries; mark streamout_enable dirty if the enable state changed. */
bool old_streamout_query_enable_state = si_get_streamout_enable_state(sctx);
sctx->streamout.num_ngg_queries++;
if (old_streamout_query_enable_state != si_get_streamout_enable_state(sctx))
si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
return true;
}
@ -161,11 +166,16 @@ static bool gfx11_sh_query_end(struct si_context *sctx, struct si_query *rquery)
0xffffffff, PIPE_QUERY_GPU_FINISHED);
}
si_update_prims_generated_query_state(sctx, query->b.type, -1);
/* Update num_ngg_queries; mark streamout_enable dirty if the enable state changed. */
bool old_streamout_query_enable_state = si_get_streamout_enable_state(sctx);
sctx->streamout.num_ngg_queries--;
if (old_streamout_query_enable_state != si_get_streamout_enable_state(sctx))
si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
if (sctx->streamout.num_ngg_queries <= 0 || !si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query)) {
si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 0);
/* If a query_begin is followed by a query_end without a draw
* in-between, we need to clear the atom to ensure that the

View file

@ -616,8 +616,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state);
si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref);
si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map);
if (ctx->gfx_level < GFX11)
si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
/* CLEAR_STATE disables all window rectangles. */
if (!has_clear_state || ctx->num_window_rectangles > 0)
si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);

View file

@ -1896,7 +1896,32 @@ static inline struct si_shader_ctx_state *si_get_vs(struct si_context *sctx)
static inline bool si_get_streamout_enable_state(struct si_context *sctx)
{
return sctx->streamout.streamout_enabled || sctx->streamout.prims_gen_query_enabled;
/* For GFX11, return whether NGG streamout queries are enabled. For older gens, return whether
* streamout hw is enabled.
*
* Note that when both PRIMITIVES_GENERATED and SO_OVERFLOW queries are enabled and XFB is
* disabled, SO_OVERFLOW queries will incorrectly return true because PRIMITIVES_GENERATED
* is incremented and PRIMITIVES_EMITTED is not. The problem is that SO_OVERFLOW queries
* are implemented by comparing PRIMITIVES_GENERATED and PRIMITIVES_EMITTED, however, when
* XFB is disabled, SO_OVERFLOW queries should increment neither PRIMITIVES_GENERATED nor
* PRIMITIVES_EMITTED, but when a separate PRIMITIVES_GENERATED is active, we should increment
* it. So the 2 queries are in conflict when XFB is disabled.
*
* Possible solutions:
* - For NGG: Emulate SO_OVERFLOW queries using memory stores separately from PRIMITIVES_GENERATED.
* - For legacy: Emulate SO_OVERFLOW queries using memory stores, same as NGG.
*/
if (sctx->gfx_level >= GFX11) {
/* Enable NGG streamout queries when PRIMITIVES_GENERATED queries are active or when
* streamout is enabled and any streamout queries except PRIMITIVES_GENERATED are active.
*/
return sctx->streamout.prims_gen_query_enabled ||
(sctx->streamout.streamout_enabled &&
(sctx->streamout.num_ngg_queries -
sctx->streamout.prims_gen_query_enabled > 0));
} else {
return sctx->streamout.streamout_enabled || sctx->streamout.prims_gen_query_enabled;
}
}
static inline unsigned si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size)

View file

@ -419,41 +419,45 @@ void si_emit_streamout_end(struct si_context *sctx)
static void si_emit_streamout_enable(struct si_context *sctx, unsigned index)
{
assert(sctx->gfx_level < GFX11);
radeon_begin(&sctx->gfx_cs);
radeon_set_context_reg_seq(R_028B94_VGT_STRMOUT_CONFIG, 2);
radeon_emit(S_028B94_STREAMOUT_0_EN(si_get_streamout_enable_state(sctx)) |
S_028B94_RAST_STREAM(0) |
S_028B94_STREAMOUT_1_EN(si_get_streamout_enable_state(sctx)) |
S_028B94_STREAMOUT_2_EN(si_get_streamout_enable_state(sctx)) |
S_028B94_STREAMOUT_3_EN(si_get_streamout_enable_state(sctx)));
radeon_emit(sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask);
radeon_end();
if (sctx->gfx_level >= GFX11) {
SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED,
si_get_streamout_enable_state(sctx));
} else {
radeon_begin(&sctx->gfx_cs);
radeon_set_context_reg_seq(R_028B94_VGT_STRMOUT_CONFIG, 2);
radeon_emit(S_028B94_STREAMOUT_0_EN(si_get_streamout_enable_state(sctx)) |
S_028B94_RAST_STREAM(0) |
S_028B94_STREAMOUT_1_EN(si_get_streamout_enable_state(sctx)) |
S_028B94_STREAMOUT_2_EN(si_get_streamout_enable_state(sctx)) |
S_028B94_STREAMOUT_3_EN(si_get_streamout_enable_state(sctx)));
radeon_emit(sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask);
radeon_end();
}
}
static void si_set_streamout_enable(struct si_context *sctx, bool enable)
{
if (sctx->gfx_level >= GFX11)
return;
bool old_strmout_en = si_get_streamout_enable_state(sctx);
unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;
sctx->streamout.streamout_enabled = enable;
sctx->streamout.hw_enabled_mask =
sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) |
(sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12);
if ((old_strmout_en != si_get_streamout_enable_state(sctx)) ||
(old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))
if (old_strmout_en != si_get_streamout_enable_state(sctx))
si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
if (sctx->gfx_level < GFX11) {
sctx->streamout.hw_enabled_mask =
sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) |
(sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12);
if (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)
si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
}
}
void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff)
{
if (sctx->gfx_level < GFX11 && type == PIPE_QUERY_PRIMITIVES_GENERATED) {
if (type == PIPE_QUERY_PRIMITIVES_GENERATED) {
bool old_strmout_en = si_get_streamout_enable_state(sctx);
sctx->streamout.num_prims_gen_queries += diff;
@ -479,7 +483,5 @@ void si_init_streamout_functions(struct si_context *sctx)
sctx->b.stream_output_target_destroy = si_so_target_destroy;
sctx->b.set_stream_output_targets = si_set_streamout_targets;
sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
if (sctx->gfx_level < GFX11)
sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
}