From f7305f776ed53dd8bac141ddbeeed7aacbff5270 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timur=20Krist=C3=B3f?= <timur.kristof@gmail.com>
Date: Mon, 27 Jan 2025 12:47:52 +0100
Subject: [PATCH] ac/nir/ngg: Pass radeon_info to mesh shader lowering.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same idea as the VS/TES and GS lowering:
Make shader compilation decisions based on the features of the
current GPU instead of ad-hoc deciding according to GFX level.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33218>
---
 src/amd/common/nir/ac_nir.h                | 20 +++++-----
 src/amd/common/nir/ac_nir_lower_ngg_mesh.c | 43 +++++++++++-----------
 src/amd/vulkan/radv_shader.c               |  2 +-
 3 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/src/amd/common/nir/ac_nir.h b/src/amd/common/nir/ac_nir.h
index e83c41ac435..6020015e1a1 100644
--- a/src/amd/common/nir/ac_nir.h
+++ b/src/amd/common/nir/ac_nir.h
@@ -183,16 +183,16 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options)
 
 void
 ac_nir_lower_ngg_mesh(nir_shader *shader,
-                    enum amd_gfx_level gfx_level,
-                    uint32_t clipdist_enable_mask,
-                    const uint8_t *vs_output_param_offset,
-                    bool has_param_exports,
-                    bool *out_needs_scratch_ring,
-                    unsigned wave_size,
-                    unsigned workgroup_size,
-                    bool multiview,
-                    bool has_query,
-                    bool fast_launch_2);
+                      const struct radeon_info *hw_info,
+                      uint32_t clipdist_enable_mask,
+                      const uint8_t *vs_output_param_offset,
+                      bool has_param_exports,
+                      bool *out_needs_scratch_ring,
+                      unsigned wave_size,
+                      unsigned workgroup_size,
+                      bool multiview,
+                      bool has_query,
+                      bool fast_launch_2);
 
 void
 ac_nir_lower_task_outputs_to_mem(nir_shader *shader,
diff --git a/src/amd/common/nir/ac_nir_lower_ngg_mesh.c b/src/amd/common/nir/ac_nir_lower_ngg_mesh.c
index e73c828c30d..f4db65951cd 100644
--- a/src/amd/common/nir/ac_nir_lower_ngg_mesh.c
+++ b/src/amd/common/nir/ac_nir_lower_ngg_mesh.c
@@ -6,6 +6,7 @@
 
 #include "ac_nir.h"
 #include "ac_nir_helpers.h"
+#include "ac_gpu_info.h"
 
 #include "nir_builder.h"
 
@@ -89,7 +90,7 @@ typedef struct
 
 typedef struct
 {
-   enum amd_gfx_level gfx_level;
+   const struct radeon_info *hw_info;
    bool fast_launch_2;
    bool vert_multirow_export;
    bool prim_multirow_export;
@@ -799,7 +800,7 @@ ms_prim_exp_arg_ch1(nir_builder *b, nir_def *invocation_index, nir_def *num_vtx,
       indices[i] = nir_umin(b, indices[i], max_vtx_idx);
    }
 
-   return ac_nir_pack_ngg_prim_exp_arg(b, s->vertices_per_prim, indices, cull_flag, s->gfx_level);
+   return ac_nir_pack_ngg_prim_exp_arg(b, s->vertices_per_prim, indices, cull_flag, s->hw_info->gfx_level);
 }
 
 static nir_def *
@@ -826,7 +827,7 @@ ms_prim_exp_arg_ch2(nir_builder *b, uint64_t outputs_mask, lower_ngg_ms_state *s
 
       if (outputs_mask & VARYING_BIT_LAYER) {
          nir_def *layer =
-            nir_ishl_imm(b, s->out.outputs[VARYING_SLOT_LAYER][0], s->gfx_level >= GFX11 ? 0 : 17);
+            nir_ishl_imm(b, s->out.outputs[VARYING_SLOT_LAYER][0], s->hw_info->gfx_level >= GFX11 ? 0 : 17);
          prim_exp_arg_ch2 = nir_ior(b, prim_exp_arg_ch2, layer);
       }
 
@@ -890,7 +891,7 @@ emit_ms_vertex(nir_builder *b, nir_def *index, nir_def *row, bool exports, bool
    ms_emit_arrayed_outputs(b, index, per_vertex_outputs, s);
 
    if (exports) {
-      ac_nir_export_position(b, s->gfx_level, s->clipdist_enable_mask,
+      ac_nir_export_position(b, s->hw_info->gfx_level, s->clipdist_enable_mask,
                              !s->has_param_exports, false, true,
                              s->per_vertex_outputs | VARYING_BIT_POS, &s->out, row);
    }
@@ -899,12 +900,12 @@ emit_ms_vertex(nir_builder *b, nir_def *index, nir_def *row, bool exports, bool
       /* Export generic attributes on GFX10.3
        * (On GFX11 they are already stored in the attribute ring.)
        */
-      if (s->has_param_exports && s->gfx_level == GFX10_3) {
+      if (s->has_param_exports && s->hw_info->gfx_level == GFX10_3) {
          ac_nir_export_parameters(b, s->vs_output_param_offset, per_vertex_outputs, 0, &s->out);
       }
 
       /* GFX11+: also store special outputs to the attribute ring so PS can load them. */
-      if (s->gfx_level >= GFX11 && (per_vertex_outputs & MS_VERT_ARG_EXP_MASK))
+      if (s->hw_info->gfx_level >= GFX11 && (per_vertex_outputs & MS_VERT_ARG_EXP_MASK))
          ms_emit_attribute_ring_output_stores(b, per_vertex_outputs & MS_VERT_ARG_EXP_MASK, index, s);
    }
 }
@@ -937,12 +938,12 @@ emit_ms_primitive(nir_builder *b, nir_def *index, nir_def *row, bool exports, bo
       /* Export generic attributes on GFX10.3
        * (On GFX11 they are already stored in the attribute ring.)
        */
-      if (s->has_param_exports && s->gfx_level == GFX10_3) {
+      if (s->has_param_exports && s->hw_info->gfx_level == GFX10_3) {
          ac_nir_export_parameters(b, s->vs_output_param_offset, per_primitive_outputs, 0, &s->out);
       }
 
       /* GFX11+: also store special outputs to the attribute ring so PS can load them. */
-      if (s->gfx_level >= GFX11)
+      if (s->hw_info->gfx_level >= GFX11)
          ms_emit_attribute_ring_output_stores(b, per_primitive_outputs & MS_PRIM_ARG_EXP_MASK, index, s);
    }
 }
@@ -1045,7 +1046,7 @@ emit_ms_finale(nir_builder *b, lower_ngg_ms_state *s)
       (per_vertex_outputs & MS_VERT_ARG_EXP_MASK) ||
       (per_primitive_outputs & MS_PRIM_ARG_EXP_MASK);
 
-   const bool wait_attr_ring = must_wait_attr_ring(s->gfx_level, has_special_param_exports);
+   const bool wait_attr_ring = must_wait_attr_ring(s->hw_info->gfx_level, has_special_param_exports);
 
    /* Export vertices. */
    if ((per_vertex_outputs & ~VARYING_BIT_POS) || !wait_attr_ring) {
@@ -1350,16 +1351,16 @@ ms_calculate_output_layout(enum amd_gfx_level gfx_level, unsigned api_shared_siz
 
 void
 ac_nir_lower_ngg_mesh(nir_shader *shader,
-                    enum amd_gfx_level gfx_level,
-                    uint32_t clipdist_enable_mask,
-                    const uint8_t *vs_output_param_offset,
-                    bool has_param_exports,
-                    bool *out_needs_scratch_ring,
-                    unsigned wave_size,
-                    unsigned hw_workgroup_size,
-                    bool multiview,
-                    bool has_query,
-                    bool fast_launch_2)
+                      const struct radeon_info *hw_info,
+                      uint32_t clipdist_enable_mask,
+                      const uint8_t *vs_output_param_offset,
+                      bool has_param_exports,
+                      bool *out_needs_scratch_ring,
+                      unsigned wave_size,
+                      unsigned hw_workgroup_size,
+                      bool multiview,
+                      bool has_query,
+                      bool fast_launch_2)
 {
    unsigned vertices_per_prim =
       mesa_vertices_per_prim(shader->info.mesh.primitive_type);
@@ -1379,7 +1380,7 @@ ac_nir_lower_ngg_mesh(nir_shader *shader,
    unsigned max_primitives = shader->info.mesh.max_primitives_out;
 
    ms_out_mem_layout layout = ms_calculate_output_layout(
-      gfx_level, shader->info.shared_size, per_vertex_outputs, per_primitive_outputs,
+      hw_info->gfx_level, shader->info.shared_size, per_vertex_outputs, per_primitive_outputs,
       cross_invocation_access, max_vertices, max_primitives, vertices_per_prim);
 
    shader->info.shared_size = layout.lds.total_size;
@@ -1406,7 +1407,7 @@ ac_nir_lower_ngg_mesh(nir_shader *shader,
       .hw_workgroup_size = hw_workgroup_size,
       .insert_layer_output = multiview && !(shader->info.outputs_written & VARYING_BIT_LAYER),
       .uses_cull_flags = uses_cull,
-      .gfx_level = gfx_level,
+      .hw_info = hw_info,
       .fast_launch_2 = fast_launch_2,
       .vert_multirow_export = fast_launch_2 && max_vertices > hw_workgroup_size,
       .prim_multirow_export = fast_launch_2 && max_primitives > hw_workgroup_size,
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 96890b34ce6..00fb3a5c1ac 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -802,7 +802,7 @@ radv_lower_ngg(struct radv_device *device, struct radv_shader_stage *ngg_stage,
       unsigned hw_workgroup_size = ALIGN(info->workgroup_size, info->wave_size);
 
       bool scratch_ring = false;
-      NIR_PASS_V(nir, ac_nir_lower_ngg_mesh, pdev->info.gfx_level, options.clip_cull_dist_mask,
+      NIR_PASS_V(nir, ac_nir_lower_ngg_mesh, &pdev->info, options.clip_cull_dist_mask,
                  options.vs_output_param_offset, options.has_param_exports, &scratch_ring, info->wave_size,
                  hw_workgroup_size, gfx_state->has_multiview_view_index, info->ms.has_query, pdev->mesh_fast_launch_2);
       ngg_stage->info.ms.needs_ms_scratch_ring = scratch_ring;