radv: move nir_opt_algebraic loop for NGG culling earlier

Totals from 6913 (8.66% of 79825) affected shaders: (Navi21)
Instrs: 5373319 -> 5358717 (-0.27%); split: -0.30%, +0.03%
CodeSize: 27448536 -> 27345464 (-0.38%); split: -0.41%, +0.03%
SpillSGPRs: 982 -> 998 (+1.63%)
Latency: 22998827 -> 23011602 (+0.06%); split: -0.13%, +0.19%
InvThroughput: 4663749 -> 4664809 (+0.02%); split: -0.00%, +0.03%
VClause: 120845 -> 120461 (-0.32%); split: -0.49%, +0.17%
SClause: 119068 -> 116064 (-2.52%); split: -2.71%, +0.18%
Copies: 456590 -> 456450 (-0.03%); split: -0.19%, +0.16%
Branches: 145555 -> 145559 (+0.00%); split: -0.00%, +0.01%
PreSGPRs: 300465 -> 301154 (+0.23%); split: -0.01%, +0.24%
VALU: 3064127 -> 3064210 (+0.00%); split: -0.00%, +0.00%
SALU: 891257 -> 886368 (-0.55%); split: -0.71%, +0.16%
SMEM: 190500 -> 184624 (-3.08%); split: -3.11%, +0.02%
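How to read these lines: the main percentage is the net change across affected shaders, and "split" separates improvements from regressions. For example, for Instrs: 5358717 - 5373319 = -14602, and -14602 / 5373319 ≈ -0.27%, composed of -0.30% from shaders that shrank and +0.03% from shaders that grew.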

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36936>
Author: Rhys Perry, 2025-08-29 16:14:17 +01:00, committed by Marge Bot
parent 3fe651f607
commit c63c695149
3 changed files with 39 additions and 18 deletions
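At a high level, the commit splits radv_optimize_nir_algebraic() into an early half (the nir_opt_algebraic fixed-point loop) and a late half, and runs the early half before descriptor lowering and NGG culling for shaders that will use culling. A condensed sketch of the resulting pass order, reconstructed from the hunks below (everything else in radv_postprocess_nir() is elided):

   /* radv_postprocess_nir(), condensed:
    *
    *    nir_opt_constant_folding
    *    radv_optimize_nir_algebraic_early   <- moved up; NGG-culling stages only
    *    radv_nir_apply_pipeline_layout      <- descriptor vectorization now sees
    *                                           algebraically-optimized NIR
    *    ...
    *    radv_lower_ngg
    *       radv_optimize_nir_algebraic_late <- only the late fixups remain here
    */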


@@ -433,6 +433,15 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
    if (constant_fold_for_push_const && stage->args.ac.inline_push_const_mask)
       NIR_PASS(_, stage->nir, nir_opt_constant_folding);
 
+   /* Optimize NIR before NGG culling */
+   bool is_last_vgt_stage = radv_is_last_vgt_stage(stage);
+   bool lowered_ngg = stage->info.is_ngg && is_last_vgt_stage;
+   if (lowered_ngg && stage->nir->info.stage != MESA_SHADER_GEOMETRY && stage->info.has_ngg_culling)
+      radv_optimize_nir_algebraic_early(stage->nir);
+
+   /* This has to be done after nir_opt_algebraic for best descriptor vectorization, but also before
+    * NGG culling.
+    */
    NIR_PASS(_, stage->nir, radv_nir_apply_pipeline_layout, device, stage);
 
    NIR_PASS(_, stage->nir, nir_lower_alu_width, opt_vectorize_callback, device);
@@ -466,9 +475,7 @@
    }
 
    /* Lower I/O intrinsics to memory instructions. */
-   bool is_last_vgt_stage = radv_is_last_vgt_stage(stage);
    bool io_to_mem = radv_nir_lower_io_to_mem(device, stage);
-   bool lowered_ngg = stage->info.is_ngg && is_last_vgt_stage;
    if (lowered_ngg) {
       radv_lower_ngg(device, stage, gfx_state);
    } else if (is_last_vgt_stage) {


@@ -234,7 +234,7 @@ radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively)
 }
 
 void
-radv_optimize_nir_algebraic(nir_shader *nir, bool opt_offsets, bool opt_mqsad, enum amd_gfx_level gfx_level)
+radv_optimize_nir_algebraic_early(nir_shader *nir)
 {
    bool more_algebraic = true;
    while (more_algebraic) {
@@ -258,21 +258,11 @@ radv_optimize_nir_algebraic(nir_shader *nir, bool opt_offsets, bool opt_mqsad, e
       NIR_PASS(_, nir, nir_opt_remove_phis);
       NIR_PASS(_, nir, nir_opt_dead_cf);
    }
+}
 
-   if (opt_offsets) {
-      const nir_opt_offsets_options offset_options = {
-         .uniform_max = 0,
-         .buffer_max = ~0,
-         .shared_max = UINT16_MAX,
-         .shared_atomic_max = UINT16_MAX,
-         .allow_offset_wrap_cb = ac_nir_allow_offset_wrap_cb,
-         .cb_data = &gfx_level,
-      };
-      NIR_PASS(_, nir, nir_opt_offsets, &offset_options);
-   }
-   if (opt_mqsad)
-      NIR_PASS(_, nir, nir_opt_mqsad);
-
+void
+radv_optimize_nir_algebraic_late(nir_shader *nir)
+{
    /* Do late algebraic optimization to turn add(a,
     * neg(b)) back into subs, then the mandatory cleanup
     * after algebraic. Note that it may produce fnegs,
@@ -292,6 +282,28 @@ radv_optimize_nir_algebraic(nir_shader *nir, bool opt_offsets, bool opt_mqsad, e
    _mesa_set_destroy(skip, NULL);
 }
 
+void
+radv_optimize_nir_algebraic(nir_shader *nir, bool opt_offsets, bool opt_mqsad, enum amd_gfx_level gfx_level)
+{
+   radv_optimize_nir_algebraic_early(nir);
+
+   if (opt_offsets) {
+      const nir_opt_offsets_options offset_options = {
+         .uniform_max = 0,
+         .buffer_max = ~0,
+         .shared_max = UINT16_MAX,
+         .shared_atomic_max = UINT16_MAX,
+         .allow_offset_wrap_cb = ac_nir_allow_offset_wrap_cb,
+         .cb_data = &gfx_level,
+      };
+      NIR_PASS(_, nir, nir_opt_offsets, &offset_options);
+   }
+   if (opt_mqsad)
+      NIR_PASS(_, nir, nir_opt_mqsad);
+
+   radv_optimize_nir_algebraic_late(nir);
+}
+
 static void
 shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
 {
@@ -913,7 +925,7 @@ radv_lower_ngg(struct radv_device *device, struct radv_shader_stage *ngg_stage,
    assert(info->is_ngg);
 
    if (info->has_ngg_culling)
-      radv_optimize_nir_algebraic(nir, false, false, pdev->info.gfx_level);
+      radv_optimize_nir_algebraic_late(nir);
 
    options.num_vertices_per_primitive = num_vertices_per_prim;
    options.early_prim_export = info->has_ngg_early_prim_export;
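For context, the "more_algebraic" loop that becomes radv_optimize_nir_algebraic_early() reruns NIR cleanup passes until nir_opt_algebraic stops reporting progress. The hunks above elide the loop body, so the exact pass list below is an assumption (only nir_opt_remove_phis and nir_opt_dead_cf are visible in the diff); a rough sketch of its shape:

   bool more_algebraic = true;
   while (more_algebraic) {
      more_algebraic = false;
      /* Canonicalize so nir_opt_algebraic can match patterns (assumed passes). */
      NIR_PASS(_, nir, nir_copy_prop);
      NIR_PASS(_, nir, nir_opt_dce);
      NIR_PASS(_, nir, nir_opt_constant_folding);
      /* NIR_PASS records the pass's progress in more_algebraic, so the loop
       * repeats while algebraic rewrites keep firing. */
      NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
      /* Cleanup visible in the hunk above. */
      NIR_PASS(_, nir, nir_opt_remove_phis);
      NIR_PASS(_, nir, nir_opt_dead_cf);
   }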


@@ -487,6 +487,8 @@ struct radv_shader_dma_submission {
 struct radv_shader_stage;
 
 void radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively);
+void radv_optimize_nir_algebraic_early(nir_shader *shader);
+void radv_optimize_nir_algebraic_late(nir_shader *shader);
 void radv_optimize_nir_algebraic(nir_shader *shader, bool opt_offsets, bool opt_mqsad,
                                  enum amd_gfx_level gfx_level);