ac/nir/lower_ngg: pack GS outputs and XFB outputs in LDS optimally

This switches the code to the new slot offsets from ac_nir_prerast_out
instead of using a prefix bitmask over outputs_written.

The LDS layout no longer includes these:
- GS: output components that are not written by GS
- VS/TES+XFB: output components that are not written by XFB
- VS/TES+XFB: slots that are not written by XFB (this could be significant)

This is also a cleanup because it deduplicates the bitcounts.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35351>
This commit is contained in:
Marek Olšák 2025-05-27 09:01:34 -04:00 committed by Marge Bot
parent 75b1602c14
commit 814990684d
4 changed files with 116 additions and 125 deletions

View file

@ -228,13 +228,19 @@ ac_nir_ngg_build_streamout_buffer_info(nir_builder *b,
nir_def *buffer_offsets_ret[4],
nir_def *emit_prim_ret[4]);
unsigned
ac_nir_get_lds_gs_out_slot_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component);
unsigned
ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component,
bool data_is_16bit);
void
ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
unsigned stream, nir_def *so_buffer[4],
nir_def *buffer_offsets[4],
unsigned vertex_index, nir_def *vtx_lds_addr,
ac_nir_prerast_out *pr_out,
bool skip_primitive_id);
ac_nir_prerast_out *pr_out);
void
ac_nir_repack_invocations_in_workgroup(nir_builder *b, nir_def **input_bool,

View file

@ -70,7 +70,6 @@ typedef struct
bool early_prim_export;
bool streamout_enabled;
bool has_user_edgeflags;
bool skip_primitive_id;
unsigned max_num_waves;
/* LDS params */
@ -205,18 +204,12 @@ emit_ngg_nogs_prim_export(nir_builder *b, lower_ngg_nogs_state *s, nir_def *arg)
unsigned edge_flag_bits = ac_get_all_edge_flag_bits(s->options->hw_info->gfx_level);
nir_def *mask = nir_imm_intN_t(b, ~edge_flag_bits, 32);
unsigned edge_flag_offset = 0;
if (s->streamout_enabled) {
unsigned packed_location =
util_bitcount64(b->shader->info.outputs_written &
BITFIELD64_MASK(VARYING_SLOT_EDGE));
edge_flag_offset = packed_location * 16;
}
for (int i = 0; i < s->options->num_vertices_per_primitive; i++) {
nir_def *vtx_idx = nir_load_var(b, s->gs_vtx_indices_vars[i]);
nir_def *addr = pervertex_lds_addr(b, vtx_idx, s->pervertex_lds_bytes);
nir_def *edge = nir_load_shared(b, 1, 32, addr, .base = edge_flag_offset);
/* Edge flags share LDS with XFB. */
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_EDGE, 0, false);
nir_def *edge = nir_load_shared(b, 1, 32, addr, .base = offset);
if (s->options->hw_info->gfx_level >= GFX12)
mask = nir_ior(b, mask, nir_ishl_imm(b, edge, 8 + i * 9));
@ -1310,18 +1303,12 @@ ngg_nogs_store_edgeflag_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
nir_def *edgeflag = s->out.outputs[VARYING_SLOT_EDGE][0];
edgeflag = nir_umin(b, edgeflag, nir_imm_int(b, 1));
/* user edge flag is stored at the beginning of a vertex if streamout is not enabled */
unsigned offset = 0;
if (s->streamout_enabled) {
unsigned packed_location =
util_bitcount64(b->shader->info.outputs_written & BITFIELD64_MASK(VARYING_SLOT_EDGE));
offset = packed_location * 16;
}
nir_def *tid = nir_load_local_invocation_index(b);
nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);
nir_store_shared(b, edgeflag, addr, .base = offset);
/* Edge flags share LDS with XFB. */
nir_store_shared(b, edgeflag, addr,
.base = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_EDGE, 0, false));
}
static void
@ -1357,12 +1344,6 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);
u_foreach_bit64(slot, xfb_outputs) {
uint64_t outputs_written = b->shader->info.outputs_written;
if (s->skip_primitive_id)
outputs_written &= ~VARYING_BIT_PRIMITIVE_ID;
unsigned packed_location =
util_bitcount64(outputs_written & BITFIELD64_MASK(slot));
unsigned mask = xfb_mask[slot];
/* Clear unused components. */
@ -1381,15 +1362,13 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
* OpenGL puts 16bit outputs in VARYING_SLOT_VAR0_16BIT.
*/
nir_def *store_val = nir_vec(b, &s->out.outputs[slot][start], (unsigned)count);
nir_store_shared(b, store_val, addr, .base = packed_location * 16 + start * 4);
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, slot, start,
store_val->bit_size == 16);
nir_store_shared(b, store_val, addr, .base = offset, .align_mul = 4);
}
}
unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written);
u_foreach_bit64(slot, xfb_outputs_16bit) {
unsigned packed_location = num_32bit_outputs +
util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(slot));
unsigned mask_lo = xfb_mask_16bit_lo[slot];
unsigned mask_hi = xfb_mask_16bit_hi[slot];
@ -1420,7 +1399,9 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
}
nir_def *store_val = nir_vec(b, values, (unsigned)count);
nir_store_shared(b, store_val, addr, .base = packed_location * 16 + start * 4);
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_VAR0_16BIT + slot,
start, true);
nir_store_shared(b, store_val, addr, .base = offset);
}
}
}
@ -1448,7 +1429,6 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s)
/* Write out primitive data */
nir_if *if_emit = nir_push_if(b, nir_ilt(b, tid_in_tg, emit_prim_per_stream[0]));
{
unsigned vtx_lds_stride = (b->shader->num_outputs * 4 + 1) * 4;
nir_def *num_vert_per_prim = nir_load_num_vertices_per_primitive_amd(b);
nir_def *first_vertex_idx = nir_imul(b, tid_in_tg, num_vert_per_prim);
@ -1463,9 +1443,9 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s)
nir_push_if(b, nir_igt_imm(b, num_vert_per_prim, i));
{
nir_def *vtx_lds_idx = nir_load_var(b, s->gs_vtx_indices_vars[i]);
nir_def *vtx_lds_addr = pervertex_lds_addr(b, vtx_lds_idx, vtx_lds_stride);
nir_def *vtx_lds_addr = pervertex_lds_addr(b, vtx_lds_idx, s->pervertex_lds_bytes);
ac_nir_ngg_build_streamout_vertex(b, info, 0, so_buffer, buffer_offsets, i,
vtx_lds_addr, &s->out, s->skip_primitive_id);
vtx_lds_addr, &s->out);
}
nir_pop_if(b, if_valid_vertex);
}
@ -1488,40 +1468,27 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s)
}
static unsigned
ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
unsigned shader_num_outputs,
ngg_nogs_get_pervertex_lds_size(lower_ngg_nogs_state *s,
gl_shader_stage stage,
bool streamout_enabled,
bool export_prim_id,
bool has_user_edgeflags)
{
unsigned pervertex_lds_bytes = 0;
if (streamout_enabled) {
/* The extra dword is used to avoid LDS bank conflicts and store the primitive id.
* TODO: only alloc space for outputs that really need streamout.
*/
pervertex_lds_bytes = (shader_num_outputs * 4 + 1) * 4;
}
bool need_prim_id_store_shared = export_prim_id && stage == MESA_SHADER_VERTEX;
if (need_prim_id_store_shared || has_user_edgeflags) {
unsigned size = 0;
if (need_prim_id_store_shared)
size += 4;
if (has_user_edgeflags)
size += 4;
unsigned xfb_size = streamout_enabled ? s->out.total_packed_xfb_lds_size : 0;
unsigned non_xfb_size = ((int)has_user_edgeflags + (int)need_prim_id_store_shared) * 4;
unsigned pervertex_lds_bytes = MAX2(xfb_size, non_xfb_size);
/* pad to odd dwords to avoid LDS bank conflict */
size |= 4;
pervertex_lds_bytes = MAX2(pervertex_lds_bytes, size);
}
/* Or 0x4 to make the size an odd number of dwords to reduce LDS bank conflicts. */
if (pervertex_lds_bytes)
pervertex_lds_bytes |= 0x4;
return pervertex_lds_bytes;
}
static void
ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nogs_state *s)
ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nogs_state *s,
bool gather_values)
{
/* Assume:
* - the shader used nir_lower_io_vars_to_temporaries
@ -1541,15 +1508,19 @@ ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nog
if (intrin->intrinsic != nir_intrinsic_store_output)
continue;
ac_nir_gather_prerast_store_output_info(b, intrin, &s->out, true);
nir_instr_remove(instr);
ac_nir_gather_prerast_store_output_info(b, intrin, &s->out, gather_values);
if (gather_values)
nir_instr_remove(instr);
}
}
if (!gather_values)
ac_nir_compute_prerast_packed_output_info(&s->out);
}
static unsigned
ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
unsigned shader_num_outputs,
ac_ngg_nogs_get_pervertex_lds_size(lower_ngg_nogs_state *s,
gl_shader_stage stage,
bool streamout_enabled,
bool export_prim_id,
bool has_user_edgeflags,
@ -1563,8 +1534,7 @@ ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
stage, uses_instance_id, uses_tess_primitive_id, NULL) : 0;
unsigned pervertex_lds_bytes =
ngg_nogs_get_pervertex_lds_size(stage, shader_num_outputs, streamout_enabled,
export_prim_id, has_user_edgeflags);
ngg_nogs_get_pervertex_lds_size(s, stage, streamout_enabled, export_prim_id, has_user_edgeflags);
return MAX2(culling_pervertex_lds_bytes, pervertex_lds_bytes);
}
@ -1617,7 +1587,6 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
.gs_exported_var = gs_exported_var,
.max_num_waves = DIV_ROUND_UP(options->max_workgroup_size, options->wave_size),
.has_user_edgeflags = has_user_edgeflags,
.skip_primitive_id = streamout_enabled && (options->export_primitive_id || options->export_primitive_id_per_prim),
};
/* Can't export the primitive ID both as per-vertex and per-primitive. */
@ -1644,6 +1613,8 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
save_reusable_variables(b, &state);
}
ngg_nogs_gather_outputs(b, &impl->body, &state, false);
nir_cf_list *extracted = rzalloc(shader, nir_cf_list);
nir_cf_extract(extracted, nir_before_impl(impl),
nir_after_impl(impl));
@ -1706,8 +1677,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
/* determine the LDS vertex stride */
state.pervertex_lds_bytes =
ngg_nogs_get_pervertex_lds_size(shader->info.stage,
shader->num_outputs,
ngg_nogs_get_pervertex_lds_size(&state, shader->info.stage,
state.streamout_enabled,
options->export_primitive_id,
state.has_user_edgeflags);
@ -1748,7 +1718,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
nir_pop_if(b, if_es_thread);
/* Gather outputs data and types */
ngg_nogs_gather_outputs(b, &if_es_thread->then_list, &state);
ngg_nogs_gather_outputs(b, &if_es_thread->then_list, &state, true);
b->cursor = nir_after_cf_list(&if_es_thread->then_list);
/* This should be after streamout and before exports. */
@ -1882,7 +1852,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
} while (progress);
*out_lds_vertex_size =
ac_ngg_nogs_get_pervertex_lds_size(shader->info.stage, shader->num_outputs, state.streamout_enabled,
ac_ngg_nogs_get_pervertex_lds_size(&state, shader->info.stage, state.streamout_enabled,
options->export_primitive_id, state.has_user_edgeflags,
options->can_cull, state.deferred.uses_instance_id,
state.deferred.uses_tess_primitive_id);

View file

@ -163,7 +163,6 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
* In case of packed 16-bit, we assume that has been already packed into 32 bit slots by now.
*/
u_foreach_bit64(slot, b->shader->info.outputs_written) {
const unsigned packed_location = util_bitcount64((b->shader->info.outputs_written & BITFIELD64_MASK(slot)));
unsigned mask = gs_output_component_mask_with_stream(&s->out.infos[slot], stream);
nir_def **output = s->out.outputs[slot];
@ -185,7 +184,7 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
nir_def *store_val = nir_vec(b, values, (unsigned)count);
nir_store_shared(b, store_val, gs_emit_vtx_addr,
.base = packed_location * 16 + start * 4,
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, slot, start),
.align_mul = 4);
}
@ -193,13 +192,8 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
memset(s->out.outputs[slot], 0, sizeof(s->out.outputs[slot]));
}
const unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written);
/* Store dedicated 16-bit outputs to LDS. */
u_foreach_bit(slot, b->shader->info.outputs_written_16bit) {
const unsigned packed_location = num_32bit_outputs +
util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(slot));
const unsigned mask_lo = gs_output_component_mask_with_stream(s->out.infos_16bit_lo + slot, stream);
const unsigned mask_hi = gs_output_component_mask_with_stream(s->out.infos_16bit_hi + slot, stream);
unsigned mask = mask_lo | mask_hi;
@ -221,7 +215,7 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
nir_def *store_val = nir_vec(b, values, (unsigned)count);
nir_store_shared(b, store_val, gs_emit_vtx_addr,
.base = packed_location * 16 + start * 4,
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_VAR0_16BIT + slot, start),
.align_mul = 4);
}
@ -387,9 +381,6 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g
}
u_foreach_bit64(slot, b->shader->info.outputs_written) {
const unsigned packed_location =
util_bitcount64((b->shader->info.outputs_written & BITFIELD64_MASK(slot)));
unsigned mask = gs_output_component_mask_with_stream(&s->out.infos[slot], 0);
while (mask) {
@ -397,7 +388,7 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g
u_bit_scan_consecutive_range(&mask, &start, &count);
nir_def *load =
nir_load_shared(b, count, 32, exported_out_vtx_lds_addr,
.base = packed_location * 16 + start * 4,
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, slot, start),
.align_mul = 4);
for (int i = 0; i < count; i++)
@ -405,13 +396,8 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g
}
}
const unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written);
/* Dedicated 16-bit outputs. */
u_foreach_bit(i, b->shader->info.outputs_written_16bit) {
const unsigned packed_location = num_32bit_outputs +
util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(i));
const unsigned mask_lo = gs_output_component_mask_with_stream(&s->out.infos_16bit_lo[i], 0);
const unsigned mask_hi = gs_output_component_mask_with_stream(&s->out.infos_16bit_hi[i], 0);
unsigned mask = mask_lo | mask_hi;
@ -421,7 +407,7 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g
u_bit_scan_consecutive_range(&mask, &start, &count);
nir_def *load =
nir_load_shared(b, count, 32, exported_out_vtx_lds_addr,
.base = packed_location * 16 + start * 4,
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_VAR0_16BIT + i, start),
.align_mul = 4);
for (int j = 0; j < count; j++) {
@ -639,11 +625,12 @@ ngg_gs_cull_primitive(nir_builder *b, nir_def *tid_in_tg, nir_def *max_vtxcnt,
/* Load the positions from LDS. */
nir_def *pos[3][4];
for (unsigned i = 0; i < s->num_vertices_per_primitive; i++) {
/* VARYING_SLOT_POS == 0, so base won't count packed location */
pos[i][3] = nir_load_shared(b, 1, 32, vtxptr[i], .base = 12); /* W */
nir_def *xy = nir_load_shared(b, 2, 32, vtxptr[i], .base = 0, .align_mul = 4);
pos[i][0] = nir_channel(b, xy, 0);
pos[i][1] = nir_channel(b, xy, 1);
/* Load X, Y, W position components. */
for (unsigned c = 0; c < 4; c == 1 ? c += 2 : c++) {
pos[i][c] = nir_load_shared(b, 1, 32, vtxptr[i],
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_POS, c),
.align_mul = 4);
}
pos[i][0] = nir_fdiv(b, pos[i][0], pos[i][3]);
pos[i][1] = nir_fdiv(b, pos[i][1], pos[i][3]);
@ -806,7 +793,7 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s)
ac_nir_ngg_build_streamout_vertex(b, info, stream, so_buffer,
stream_buffer_offsets, i,
exported_vtx_lds_addr[i],
&s->out, false);
&s->out);
}
}
nir_pop_if(b, if_emit);
@ -903,16 +890,10 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options,
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
assert(impl);
/* Add 4 for primflags. */
*out_lds_vertex_size = (util_bitcount64(shader->info.outputs_written) +
util_bitcount(shader->info.outputs_written_16bit)) * 16 + 4;
lower_ngg_gs_state state = {
.options = options,
.impl = impl,
.max_num_waves = DIV_ROUND_UP(options->max_workgroup_size, options->wave_size),
.lds_offs_primflags = *out_lds_vertex_size - 4,
.lds_bytes_per_gs_out_vertex = *out_lds_vertex_size,
.streamout_enabled = shader->xfb_info && !options->disable_streamout,
};
@ -962,6 +943,11 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options,
* gathers output values, which must be done while lowering GS intrinsics and not before.
*/
gather_output_stores(shader, &state);
ac_nir_compute_prerast_packed_output_info(&state.out);
state.lds_offs_primflags = state.out.total_packed_gs_out_size;
/* Make the vertex size in LDS be an odd number of dwords (| 0x4) to reduce LDS bank conflicts. */
state.lds_bytes_per_gs_out_vertex = (state.out.total_packed_gs_out_size + 4 /*primflags*/) | 0x4;
lower_ngg_gs_intrinsics(shader, &state);
if (state.streamout_enabled)
@ -995,5 +981,7 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options,
/* Cleanup */
nir_lower_vars_to_ssa(shader);
nir_remove_dead_variables(shader, nir_var_function_temp, NULL);
*out_lds_vertex_size = state.lds_bytes_per_gs_out_vertex;
return nir_progress(true, impl, nir_metadata_none);
}

View file

@ -1310,13 +1310,59 @@ ac_nir_ngg_build_streamout_buffer_info(nir_builder *b,
}
}
/* Compute the LDS byte offset of one component of a packed GS output.
 *
 * The slot's base offset comes from the precomputed packed_slot_gs_out_offset,
 * and the component offset is 4 bytes per written component that precedes
 * "component" in the slot's component mask (unwritten components take no space).
 *
 * For dedicated 16-bit slots (VARYING_SLOT_VAR0_16BIT and up), the lo and hi
 * halves share one packed slot, so their masks are merged.
 */
unsigned
ac_nir_get_lds_gs_out_slot_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component)
{
   assert(component < 4);

   unsigned slot_base, comp_mask;

   if (slot < VARYING_SLOT_VAR0_16BIT) {
      slot_base = pr_out->infos[slot].packed_slot_gs_out_offset;
      comp_mask = pr_out->infos[slot].components_mask;
   } else {
      const unsigned idx = slot - VARYING_SLOT_VAR0_16BIT;

      /* Both 16-bit halves must have been packed at the same slot offset. */
      assert(pr_out->infos_16bit_lo[idx].packed_slot_gs_out_offset ==
             pr_out->infos_16bit_hi[idx].packed_slot_gs_out_offset);

      slot_base = pr_out->infos_16bit_lo[idx].packed_slot_gs_out_offset;
      comp_mask = pr_out->infos_16bit_lo[idx].components_mask |
                  pr_out->infos_16bit_hi[idx].components_mask;
   }

   /* 4 bytes for each written component below the requested one. */
   return slot_base + util_bitcount(comp_mask & BITFIELD_MASK(component)) * 4;
}
/* Compute the LDS byte offset of one component of a packed XFB output.
 *
 * Like ac_nir_get_lds_gs_out_slot_offset, but using the XFB-specific packed
 * slot offsets and component masks: the slot's base offset is
 * packed_slot_xfb_lds_offset, plus 4 bytes per XFB-written component that
 * precedes "component" in xfb_lds_components_mask.
 *
 * 16-bit data in regular (non-VAR0_16BIT) slots is not implemented.
 */
unsigned
ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component,
                              bool data_is_16bit)
{
   assert(component < 4);

   unsigned slot_base = 0, comp_mask = 0;

   if (slot >= VARYING_SLOT_VAR0_16BIT) {
      const unsigned idx = slot - VARYING_SLOT_VAR0_16BIT;

      /* Both 16-bit halves must have been packed at the same slot offset. */
      assert(pr_out->infos_16bit_lo[idx].packed_slot_xfb_lds_offset ==
             pr_out->infos_16bit_hi[idx].packed_slot_xfb_lds_offset);

      slot_base = pr_out->infos_16bit_lo[idx].packed_slot_xfb_lds_offset;
      comp_mask = pr_out->infos_16bit_lo[idx].xfb_lds_components_mask |
                  pr_out->infos_16bit_hi[idx].xfb_lds_components_mask;
   } else if (data_is_16bit) {
      assert(!"unimplemented");
   } else {
      slot_base = pr_out->infos[slot].packed_slot_xfb_lds_offset;
      comp_mask = pr_out->infos[slot].xfb_lds_components_mask;
   }

   /* 4 bytes for each XFB-written component below the requested one. */
   return slot_base + util_bitcount(comp_mask & BITFIELD_MASK(component)) * 4;
}
void
ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
unsigned stream, nir_def *so_buffer[4],
nir_def *buffer_offsets[4],
unsigned vertex_index, nir_def *vtx_lds_addr,
ac_nir_prerast_out *pr_out,
bool skip_primitive_id)
ac_nir_prerast_out *pr_out)
{
unsigned vertex_offset[NIR_MAX_XFB_BUFFERS] = {0};
@ -1337,32 +1383,13 @@ ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
if (!out->component_mask || info->buffer_to_stream[out->buffer] != stream)
continue;
unsigned base;
if (out->location >= VARYING_SLOT_VAR0_16BIT) {
base =
util_bitcount64(b->shader->info.outputs_written) +
util_bitcount(b->shader->info.outputs_written_16bit &
BITFIELD_MASK(out->location - VARYING_SLOT_VAR0_16BIT));
} else {
uint64_t outputs_written = b->shader->info.outputs_written;
if (skip_primitive_id)
outputs_written &= ~VARYING_BIT_PRIMITIVE_ID;
base =
util_bitcount64(outputs_written &
BITFIELD64_MASK(out->location));
}
unsigned offset = (base * 4 + out->component_offset) * 4;
unsigned count = util_bitcount(out->component_mask);
assert(u_bit_consecutive(out->component_offset, count) == out->component_mask);
nir_def *out_data =
nir_load_shared(b, count, 32, vtx_lds_addr, .base = offset);
for (unsigned comp = 0; comp < count; comp++) {
nir_def *data = nir_channel(b, out_data, comp);
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(pr_out, out->location,
out->component_offset + comp,
out->data_is_16bit);
nir_def *data = nir_load_shared(b, 1, 32, vtx_lds_addr, .base = offset, .align_mul = 4);
/* Convert 16-bit outputs to 32-bit.
*