ac/nir/lower_ngg: add & use new scalar helpers for GS loads/stores

This simplifies the code and scalarizes the loads/stores.
Scalar loads/stores will allow forwarding constant output components
from stores to loads easily.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35352>
This commit is contained in:
Marek Olšák 2025-05-29 09:14:36 -04:00 committed by Marge Bot
parent f407129b7f
commit 4b6ae11207
3 changed files with 57 additions and 73 deletions

View file

@ -228,13 +228,18 @@ ac_nir_ngg_build_streamout_buffer_info(nir_builder *b,
nir_def *buffer_offsets_ret[4],
nir_def *emit_prim_ret[4]);
unsigned
ac_nir_get_lds_gs_out_slot_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component);
unsigned
ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component,
bool data_is_16bit);
void
ac_nir_store_shared_gs_out(nir_builder *b, nir_def *value, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
gl_varying_slot slot, unsigned component);
nir_def *
ac_nir_load_shared_gs_out(nir_builder *b, unsigned bit_size, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
gl_varying_slot slot, unsigned component);
void
ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
unsigned stream, nir_def *so_buffer[4],

View file

@ -164,28 +164,14 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
*/
u_foreach_bit64(slot, b->shader->info.outputs_written) {
unsigned mask = gs_output_component_mask_with_stream(&s->out.infos[slot], stream);
nir_def **output = s->out.outputs[slot];
nir_def *undef = nir_undef(b, 1, 32);
while (mask) {
int start, count;
u_bit_scan_consecutive_range(&mask, &start, &count);
nir_def *values[4] = {0};
for (int c = start; c < start + count; ++c) {
if (!output[c]) {
/* The shader hasn't written this output. */
values[c - start] = undef;
} else {
assert(output[c]->bit_size == 32);
values[c - start] = output[c];
}
}
u_foreach_bit(c, mask) {
/* The shader hasn't written this output yet. */
if (!output[c])
continue;
nir_def *store_val = nir_vec(b, values, (unsigned)count);
nir_store_shared(b, store_val, gs_emit_vtx_addr,
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, slot, start),
.align_mul = 4);
ac_nir_store_shared_gs_out(b, output[c], gs_emit_vtx_addr, &s->out, slot, c);
}
/* Clear all outputs (they are undefined after emit_vertex) */
@ -202,21 +188,16 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
nir_def **output_hi = s->out.outputs_16bit_hi[slot];
nir_def *undef = nir_undef(b, 1, 16);
while (mask) {
int start, count;
u_bit_scan_consecutive_range(&mask, &start, &count);
nir_def *values[4] = {0};
for (int c = start; c < start + count; ++c) {
nir_def *lo = output_lo[c] ? output_lo[c] : undef;
nir_def *hi = output_hi[c] ? output_hi[c] : undef;
u_foreach_bit(c, mask) {
/* The shader hasn't written this output yet. */
if (!output_lo[c] && !output_hi[c])
continue;
values[c - start] = nir_pack_32_2x16_split(b, lo, hi);
}
nir_def *lo = output_lo[c] ? output_lo[c] : undef;
nir_def *hi = output_hi[c] ? output_hi[c] : undef;
nir_def *store_val = nir_pack_32_2x16_split(b, lo, hi);
nir_def *store_val = nir_vec(b, values, (unsigned)count);
nir_store_shared(b, store_val, gs_emit_vtx_addr,
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_VAR0_16BIT + slot, start),
.align_mul = 4);
ac_nir_store_shared_gs_out(b, store_val, gs_emit_vtx_addr, &s->out, VARYING_SLOT_VAR0_16BIT + slot, c);
}
/* Clear all outputs (they are undefined after emit_vertex) */
@ -383,16 +364,9 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g
u_foreach_bit64(slot, b->shader->info.outputs_written) {
unsigned mask = gs_output_component_mask_with_stream(&s->out.infos[slot], 0);
while (mask) {
int start, count;
u_bit_scan_consecutive_range(&mask, &start, &count);
nir_def *load =
nir_load_shared(b, count, 32, exported_out_vtx_lds_addr,
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, slot, start),
.align_mul = 4);
for (int i = 0; i < count; i++)
s->out.outputs[slot][start + i] = nir_channel(b, load, i);
u_foreach_bit(c, mask) {
s->out.outputs[slot][c] = ac_nir_load_shared_gs_out(b, 32, exported_out_vtx_lds_addr,
&s->out, slot, c);
}
}
@ -402,24 +376,15 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g
const unsigned mask_hi = gs_output_component_mask_with_stream(&s->out.infos_16bit_hi[i], 0);
unsigned mask = mask_lo | mask_hi;
while (mask) {
int start, count;
u_bit_scan_consecutive_range(&mask, &start, &count);
nir_def *load =
nir_load_shared(b, count, 32, exported_out_vtx_lds_addr,
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_VAR0_16BIT + i, start),
.align_mul = 4);
u_foreach_bit(c, mask) {
nir_def *load_val = ac_nir_load_shared_gs_out(b, 32, exported_out_vtx_lds_addr,
&s->out, VARYING_SLOT_VAR0_16BIT + i, c);
for (int j = 0; j < count; j++) {
nir_def *val = nir_channel(b, load, j);
unsigned comp = start + j;
if (mask_lo & BITFIELD_BIT(c))
s->out.outputs_16bit_lo[i][c] = nir_unpack_32_2x16_split_x(b, load_val);
if (mask_lo & BITFIELD_BIT(comp))
s->out.outputs_16bit_lo[i][comp] = nir_unpack_32_2x16_split_x(b, val);
if (mask_hi & BITFIELD_BIT(comp))
s->out.outputs_16bit_hi[i][comp] = nir_unpack_32_2x16_split_y(b, val);
}
if (mask_hi & BITFIELD_BIT(c))
s->out.outputs_16bit_hi[i][c] = nir_unpack_32_2x16_split_y(b, load_val);
}
}
@ -630,11 +595,8 @@ ngg_gs_cull_primitive(nir_builder *b, nir_def *tid_in_tg, nir_def *max_vtxcnt,
for (unsigned i = 0; i < s->num_vertices_per_primitive; i++) {
/* Load X, Y, W position components. Z is loaded only if we clip against POS. */
for (unsigned c = 0; c < 4; c == 1 && !clip_against_pos ? c += 2 : c++) {
pos[i][c] = nir_load_shared(b, 1, 32, vtxptr[i],
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_POS, c),
.align_mul = 4);
}
for (unsigned c = 0; c < 4; c == 1 && !clip_against_pos ? c += 2 : c++)
pos[i][c] = ac_nir_load_shared_gs_out(b, 32, vtxptr[i], &s->out, VARYING_SLOT_POS, c);
}
nir_def *accepted_by_clipdist = nir_imm_true(b);
@ -650,8 +612,8 @@ ngg_gs_cull_primitive(nir_builder *b, nir_def *tid_in_tg, nir_def *max_vtxcnt,
if (!clip_against_pos) {
for (unsigned i = 0; i < s->num_vertices_per_primitive; i++) {
for (unsigned c = 0; c < 4; c++) {
unsigned offset = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_CLIP_VERTEX, c);
clipvertex[i][c] = nir_load_shared(b, 1, 32, vtxptr[i], .base = offset, .align_mul = 4);
clipvertex[i][c] = ac_nir_load_shared_gs_out(b, 32, vtxptr[i], &s->out,
VARYING_SLOT_CLIP_VERTEX, c);
}
}
}
@ -668,10 +630,10 @@ ngg_gs_cull_primitive(nir_builder *b, nir_def *tid_in_tg, nir_def *max_vtxcnt,
} else {
/* Load clip distances. */
u_foreach_bit(c, s->options->cull_clipdist_mask) {
unsigned offset = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_CLIP_DIST0 + c / 4, c % 4);
for (unsigned i = 0; i < s->num_vertices_per_primitive; i++)
clipdist[i][c] = nir_load_shared(b, 1, 32, vtxptr[i], .base = offset, .align_mul = 4);
for (unsigned i = 0; i < s->num_vertices_per_primitive; i++) {
clipdist[i][c] = ac_nir_load_shared_gs_out(b, 32, vtxptr[i], &s->out,
VARYING_SLOT_CLIP_DIST0 + c / 4, c % 4);
}
}
}

View file

@ -1310,7 +1310,7 @@ ac_nir_ngg_build_streamout_buffer_info(nir_builder *b,
}
}
unsigned
static unsigned
ac_nir_get_lds_gs_out_slot_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component)
{
assert(component < 4);
@ -1357,6 +1357,23 @@ ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot,
return lds_slot_offset + util_bitcount(lds_component_mask & BITFIELD_MASK(component)) * 4;
}
/* Store one scalar GS output component into the vertex's LDS slot.
 *
 * The value must be a single component; its LDS location is derived from
 * (slot, component) via ac_nir_get_lds_gs_out_slot_offset().
 */
void
ac_nir_store_shared_gs_out(nir_builder *b, nir_def *value, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
                           gl_varying_slot slot, unsigned component)
{
   assert(value->num_components == 1);
   nir_store_shared(b, value, vtxptr,
                    .base = ac_nir_get_lds_gs_out_slot_offset(pr_out, slot, component),
                    .align_mul = 4);
}
/* Load one scalar GS output component of the given bit size from the
 * vertex's LDS slot, addressed by (slot, component) through
 * ac_nir_get_lds_gs_out_slot_offset().
 */
nir_def *
ac_nir_load_shared_gs_out(nir_builder *b, unsigned bit_size, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
                          gl_varying_slot slot, unsigned component)
{
   return nir_load_shared(b, 1, bit_size, vtxptr,
                          .base = ac_nir_get_lds_gs_out_slot_offset(pr_out, slot, component),
                          .align_mul = 4);
}
void
ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
unsigned stream, nir_def *so_buffer[4],