From b9b00a0e7a309930d4a65b1c029caa67ca8bbef4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 1 Jan 2024 19:24:34 -0500 Subject: [PATCH] ac,radeonsi: emulate GS primitive pipeline stat on gfx11 because of culling GS culls too, so the pipeline stat is incorrect. This can be exposed by forcing monolithic shader use, which makes culling shaders immediately available for tests to use. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/amd/common/ac_nir.c | 13 ++++++++----- src/amd/common/ac_nir.h | 5 ++++- src/amd/common/ac_nir_lower_ngg.c | 3 ++- src/amd/vulkan/radv_shader.c | 2 ++ src/gallium/drivers/radeonsi/si_nir_lower_abi.c | 11 ++++++++--- src/gallium/drivers/radeonsi/si_query.c | 6 ++++++ src/gallium/drivers/radeonsi/si_shader.c | 2 ++ 7 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/amd/common/ac_nir.c b/src/amd/common/ac_nir.c index 27636d41ac3..f96f72ea339 100644 --- a/src/amd/common/ac_nir.c +++ b/src/amd/common/ac_nir.c @@ -898,7 +898,8 @@ ac_nir_accum_ior(nir_builder *b, nir_def *accum_result, nir_def *new_term) bool ac_nir_gs_shader_query(nir_builder *b, bool has_gen_prim_query, - bool has_pipeline_stats_query, + bool has_gs_invocations_query, + bool has_gs_primitives_query, unsigned num_vertices_per_primitive, unsigned wave_size, nir_def *vertex_count[4], @@ -913,7 +914,7 @@ ac_nir_gs_shader_query(nir_builder *b, any_query_enabled = ac_nir_accum_ior(b, any_query_enabled, prim_gen_query_enabled); } - if (has_pipeline_stats_query) { + if (has_gs_invocations_query || has_gs_primitives_query) { pipeline_query_enabled = nir_load_pipeline_stat_query_enabled_amd(b); any_query_enabled = ac_nir_accum_ior(b, any_query_enabled, pipeline_query_enabled); } @@ -959,7 +960,7 @@ ac_nir_gs_shader_query(nir_builder *b, /* Store the query result to query result using an atomic add. */ nir_if *if_first_lane = nir_push_if(b, nir_elect(b, 1)); { - if (has_pipeline_stats_query) { + if (has_gs_invocations_query || has_gs_primitives_query) { nir_if *if_pipeline_query = nir_push_if(b, pipeline_query_enabled); { nir_def *count = NULL; @@ -974,10 +975,11 @@ ac_nir_gs_shader_query(nir_builder *b, } } - if (count) + if (has_gs_primitives_query && count) nir_atomic_add_gs_emit_prim_count_amd(b, count); - nir_atomic_add_shader_invocation_count_amd(b, num_active_threads); + if (has_gs_invocations_query) + nir_atomic_add_shader_invocation_count_amd(b, num_active_threads); } nir_pop_if(b, if_pipeline_query); } @@ -1237,6 +1239,7 @@ ac_nir_lower_legacy_gs(nir_shader *nir, bool progress = ac_nir_gs_shader_query(b, has_gen_prim_query, has_pipeline_stats_query, + has_pipeline_stats_query, num_vertices_per_primitive, 64, s.vertex_count, diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h index 8699d30529d..011fd544ec6 100644 --- a/src/amd/common/ac_nir.h +++ b/src/amd/common/ac_nir.h @@ -166,6 +166,8 @@ typedef struct { bool disable_streamout; bool has_gen_prim_query; bool has_xfb_prim_query; + bool has_gs_invocations_query; + bool has_gs_primitives_query; bool kill_pointsize; bool kill_layer; bool force_vrs; @@ -268,7 +270,8 @@ ac_nir_lower_legacy_vs(nir_shader *nir, bool ac_nir_gs_shader_query(nir_builder *b, bool has_gen_prim_query, - bool has_pipeline_stats_query, + bool has_gs_invocations_query, + bool has_gs_primitives_query, unsigned num_vertices_per_primitive, unsigned wave_size, nir_def *vertex_count[4], diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c index 74646f1c2bd..8e0e316009f 100644 --- a/src/amd/common/ac_nir_lower_ngg.c +++ b/src/amd/common/ac_nir_lower_ngg.c @@ -3573,7 +3573,8 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options) b->cursor = nir_after_cf_list(&if_gs_thread->then_list); ac_nir_gs_shader_query(b, state.options->has_gen_prim_query, - state.options->gfx_level < GFX11, + state.options->has_gs_invocations_query, + state.options->has_gs_primitives_query, state.num_vertices_per_primitive, state.options->wave_size, state.vertex_count, diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index db32f0a4224..821dde34bfa 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -856,6 +856,8 @@ radv_lower_ngg(struct radv_device *device, struct radv_shader_stage *ngg_stage, options.disable_streamout = !device->physical_device->use_ngg_streamout; options.has_gen_prim_query = info->has_prim_query; options.has_xfb_prim_query = info->has_xfb_query; + options.has_gs_invocations_query = device->physical_device->rad_info.gfx_level < GFX11; + options.has_gs_primitives_query = device->physical_device->rad_info.gfx_level < GFX11; options.force_vrs = info->force_vrs_per_vertex; if (nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL) { diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c index 8a99e9682b9..cf1f846842f 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c @@ -470,12 +470,17 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s } case nir_intrinsic_atomic_add_gs_emit_prim_count_amd: case nir_intrinsic_atomic_add_shader_invocation_count_amd: { - nir_def *buf = - si_nir_load_internal_binding(b, args, SI_GS_QUERY_EMULATED_COUNTERS_BUF, 4); - enum pipe_statistics_query_index index = intrin->intrinsic == nir_intrinsic_atomic_add_gs_emit_prim_count_amd ? PIPE_STAT_QUERY_GS_PRIMITIVES : PIPE_STAT_QUERY_GS_INVOCATIONS; + + /* GFX11 only needs to emulate PIPE_STAT_QUERY_GS_PRIMITIVES because GS culls, + * which makes the pipeline statistic incorrect. + */ + assert(sel->screen->info.gfx_level < GFX11 || index == PIPE_STAT_QUERY_GS_PRIMITIVES); + + nir_def *buf = + si_nir_load_internal_binding(b, args, SI_GS_QUERY_EMULATED_COUNTERS_BUF, 4); unsigned offset = si_query_pipestat_end_dw_offset(sel->screen, index) * 4; nir_def *count = intrin->src[0].ssa; diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index 93736254ca9..55e768d7c15 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -693,6 +693,12 @@ static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned if ((index == PIPE_STAT_QUERY_GS_PRIMITIVES || index == PIPE_STAT_QUERY_GS_INVOCATIONS) && sscreen->use_ngg && (sscreen->info.gfx_level >= GFX10 && sscreen->info.gfx_level <= GFX10_3)) query->flags |= SI_QUERY_EMULATE_GS_COUNTERS; + + /* GFX11 only emulates PIPE_STAT_QUERY_GS_PRIMITIVES because the shader culls, + * which makes the statistic incorrect. + */ + if (sscreen->info.gfx_level >= GFX11 && index == PIPE_STAT_QUERY_GS_PRIMITIVES) + query->flags |= SI_QUERY_EMULATE_GS_COUNTERS; break; default: assert(0); diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 0645de43256..4fa430fdbbf 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1976,6 +1976,8 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir) options.gs_out_vtx_bytes = sel->info.gsvs_vertex_size; options.has_gen_prim_query = options.has_xfb_prim_query = sel->screen->info.gfx_level >= GFX11; + options.has_gs_invocations_query = sel->screen->info.gfx_level < GFX11; + options.has_gs_primitives_query = true; /* For monolithic ES/GS to add vscnt wait when GS export pos0. */ if (key->ge.part.gs.es)