ac/nir/lower_ngg: pack GS outputs and XFB outputs in LDS optimally

This switches the code to the new slot offsets from ac_nir_prerast_out
instead of using a prefix bitmask over outputs_written.

The LDS layout no longer includes these:
- GS: output components that are not written by GS
- VS/TES+XFB: output components that are not written by XFB
- VS/TES+XFB: slots that are not written by XFB (this could be significant)

This is also a cleanup because it deduplicates the bitcounts.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35351>
This commit is contained in:
Marek Olšák 2025-05-27 09:01:34 -04:00 committed by Marge Bot
parent 75b1602c14
commit 814990684d
4 changed files with 116 additions and 125 deletions

View file

@ -228,13 +228,19 @@ ac_nir_ngg_build_streamout_buffer_info(nir_builder *b,
nir_def *buffer_offsets_ret[4],
nir_def *emit_prim_ret[4]);
unsigned
ac_nir_get_lds_gs_out_slot_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component);
unsigned
ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component,
bool data_is_16bit);
void
ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
unsigned stream, nir_def *so_buffer[4],
nir_def *buffer_offsets[4],
unsigned vertex_index, nir_def *vtx_lds_addr,
ac_nir_prerast_out *pr_out,
bool skip_primitive_id);
ac_nir_prerast_out *pr_out);
void
ac_nir_repack_invocations_in_workgroup(nir_builder *b, nir_def **input_bool,

View file

@ -70,7 +70,6 @@ typedef struct
bool early_prim_export;
bool streamout_enabled;
bool has_user_edgeflags;
bool skip_primitive_id;
unsigned max_num_waves;
/* LDS params */
@ -205,18 +204,12 @@ emit_ngg_nogs_prim_export(nir_builder *b, lower_ngg_nogs_state *s, nir_def *arg)
unsigned edge_flag_bits = ac_get_all_edge_flag_bits(s->options->hw_info->gfx_level);
nir_def *mask = nir_imm_intN_t(b, ~edge_flag_bits, 32);
unsigned edge_flag_offset = 0;
if (s->streamout_enabled) {
unsigned packed_location =
util_bitcount64(b->shader->info.outputs_written &
BITFIELD64_MASK(VARYING_SLOT_EDGE));
edge_flag_offset = packed_location * 16;
}
for (int i = 0; i < s->options->num_vertices_per_primitive; i++) {
nir_def *vtx_idx = nir_load_var(b, s->gs_vtx_indices_vars[i]);
nir_def *addr = pervertex_lds_addr(b, vtx_idx, s->pervertex_lds_bytes);
nir_def *edge = nir_load_shared(b, 1, 32, addr, .base = edge_flag_offset);
/* Edge flags share LDS with XFB. */
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_EDGE, 0, false);
nir_def *edge = nir_load_shared(b, 1, 32, addr, .base = offset);
if (s->options->hw_info->gfx_level >= GFX12)
mask = nir_ior(b, mask, nir_ishl_imm(b, edge, 8 + i * 9));
@ -1310,18 +1303,12 @@ ngg_nogs_store_edgeflag_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
nir_def *edgeflag = s->out.outputs[VARYING_SLOT_EDGE][0];
edgeflag = nir_umin(b, edgeflag, nir_imm_int(b, 1));
/* user edge flag is stored at the beginning of a vertex if streamout is not enabled */
unsigned offset = 0;
if (s->streamout_enabled) {
unsigned packed_location =
util_bitcount64(b->shader->info.outputs_written & BITFIELD64_MASK(VARYING_SLOT_EDGE));
offset = packed_location * 16;
}
nir_def *tid = nir_load_local_invocation_index(b);
nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);
nir_store_shared(b, edgeflag, addr, .base = offset);
/* Edge flags share LDS with XFB. */
nir_store_shared(b, edgeflag, addr,
.base = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_EDGE, 0, false));
}
static void
@ -1357,12 +1344,6 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);
u_foreach_bit64(slot, xfb_outputs) {
uint64_t outputs_written = b->shader->info.outputs_written;
if (s->skip_primitive_id)
outputs_written &= ~VARYING_BIT_PRIMITIVE_ID;
unsigned packed_location =
util_bitcount64(outputs_written & BITFIELD64_MASK(slot));
unsigned mask = xfb_mask[slot];
/* Clear unused components. */
@ -1381,15 +1362,13 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
* OpenGL puts 16bit outputs in VARYING_SLOT_VAR0_16BIT.
*/
nir_def *store_val = nir_vec(b, &s->out.outputs[slot][start], (unsigned)count);
nir_store_shared(b, store_val, addr, .base = packed_location * 16 + start * 4);
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, slot, start,
store_val->bit_size == 16);
nir_store_shared(b, store_val, addr, .base = offset, .align_mul = 4);
}
}
unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written);
u_foreach_bit64(slot, xfb_outputs_16bit) {
unsigned packed_location = num_32bit_outputs +
util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(slot));
unsigned mask_lo = xfb_mask_16bit_lo[slot];
unsigned mask_hi = xfb_mask_16bit_hi[slot];
@ -1420,7 +1399,9 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
}
nir_def *store_val = nir_vec(b, values, (unsigned)count);
nir_store_shared(b, store_val, addr, .base = packed_location * 16 + start * 4);
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_VAR0_16BIT + slot,
start, true);
nir_store_shared(b, store_val, addr, .base = offset);
}
}
}
@ -1448,7 +1429,6 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s)
/* Write out primitive data */
nir_if *if_emit = nir_push_if(b, nir_ilt(b, tid_in_tg, emit_prim_per_stream[0]));
{
unsigned vtx_lds_stride = (b->shader->num_outputs * 4 + 1) * 4;
nir_def *num_vert_per_prim = nir_load_num_vertices_per_primitive_amd(b);
nir_def *first_vertex_idx = nir_imul(b, tid_in_tg, num_vert_per_prim);
@ -1463,9 +1443,9 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s)
nir_push_if(b, nir_igt_imm(b, num_vert_per_prim, i));
{
nir_def *vtx_lds_idx = nir_load_var(b, s->gs_vtx_indices_vars[i]);
nir_def *vtx_lds_addr = pervertex_lds_addr(b, vtx_lds_idx, vtx_lds_stride);
nir_def *vtx_lds_addr = pervertex_lds_addr(b, vtx_lds_idx, s->pervertex_lds_bytes);
ac_nir_ngg_build_streamout_vertex(b, info, 0, so_buffer, buffer_offsets, i,
vtx_lds_addr, &s->out, s->skip_primitive_id);
vtx_lds_addr, &s->out);
}
nir_pop_if(b, if_valid_vertex);
}
@ -1488,40 +1468,27 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s)
}
static unsigned
ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
unsigned shader_num_outputs,
ngg_nogs_get_pervertex_lds_size(lower_ngg_nogs_state *s,
gl_shader_stage stage,
bool streamout_enabled,
bool export_prim_id,
bool has_user_edgeflags)
{
unsigned pervertex_lds_bytes = 0;
if (streamout_enabled) {
/* The extra dword is used to avoid LDS bank conflicts and store the primitive id.
* TODO: only alloc space for outputs that really need streamout.
*/
pervertex_lds_bytes = (shader_num_outputs * 4 + 1) * 4;
}
bool need_prim_id_store_shared = export_prim_id && stage == MESA_SHADER_VERTEX;
if (need_prim_id_store_shared || has_user_edgeflags) {
unsigned size = 0;
if (need_prim_id_store_shared)
size += 4;
if (has_user_edgeflags)
size += 4;
unsigned xfb_size = streamout_enabled ? s->out.total_packed_xfb_lds_size : 0;
unsigned non_xfb_size = ((int)has_user_edgeflags + (int)need_prim_id_store_shared) * 4;
unsigned pervertex_lds_bytes = MAX2(xfb_size, non_xfb_size);
/* pad to odd dwords to avoid LDS bank conflict */
size |= 4;
pervertex_lds_bytes = MAX2(pervertex_lds_bytes, size);
}
/* Or 0x4 to make the size an odd number of dwords to reduce LDS bank conflicts. */
if (pervertex_lds_bytes)
pervertex_lds_bytes |= 0x4;
return pervertex_lds_bytes;
}
static void
ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nogs_state *s)
ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nogs_state *s,
bool gather_values)
{
/* Assume:
* - the shader used nir_lower_io_vars_to_temporaries
@ -1541,15 +1508,19 @@ ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nog
if (intrin->intrinsic != nir_intrinsic_store_output)
continue;
ac_nir_gather_prerast_store_output_info(b, intrin, &s->out, true);
nir_instr_remove(instr);
ac_nir_gather_prerast_store_output_info(b, intrin, &s->out, gather_values);
if (gather_values)
nir_instr_remove(instr);
}
}
if (!gather_values)
ac_nir_compute_prerast_packed_output_info(&s->out);
}
static unsigned
ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
unsigned shader_num_outputs,
ac_ngg_nogs_get_pervertex_lds_size(lower_ngg_nogs_state *s,
gl_shader_stage stage,
bool streamout_enabled,
bool export_prim_id,
bool has_user_edgeflags,
@ -1563,8 +1534,7 @@ ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
stage, uses_instance_id, uses_tess_primitive_id, NULL) : 0;
unsigned pervertex_lds_bytes =
ngg_nogs_get_pervertex_lds_size(stage, shader_num_outputs, streamout_enabled,
export_prim_id, has_user_edgeflags);
ngg_nogs_get_pervertex_lds_size(s, stage, streamout_enabled, export_prim_id, has_user_edgeflags);
return MAX2(culling_pervertex_lds_bytes, pervertex_lds_bytes);
}
@ -1617,7 +1587,6 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
.gs_exported_var = gs_exported_var,
.max_num_waves = DIV_ROUND_UP(options->max_workgroup_size, options->wave_size),
.has_user_edgeflags = has_user_edgeflags,
.skip_primitive_id = streamout_enabled && (options->export_primitive_id || options->export_primitive_id_per_prim),
};
/* Can't export the primitive ID both as per-vertex and per-primitive. */
@ -1644,6 +1613,8 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
save_reusable_variables(b, &state);
}
ngg_nogs_gather_outputs(b, &impl->body, &state, false);
nir_cf_list *extracted = rzalloc(shader, nir_cf_list);
nir_cf_extract(extracted, nir_before_impl(impl),
nir_after_impl(impl));
@ -1706,8 +1677,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
/* determine the LDS vertex stride */
state.pervertex_lds_bytes =
ngg_nogs_get_pervertex_lds_size(shader->info.stage,
shader->num_outputs,
ngg_nogs_get_pervertex_lds_size(&state, shader->info.stage,
state.streamout_enabled,
options->export_primitive_id,
state.has_user_edgeflags);
@ -1748,7 +1718,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
nir_pop_if(b, if_es_thread);
/* Gather outputs data and types */
ngg_nogs_gather_outputs(b, &if_es_thread->then_list, &state);
ngg_nogs_gather_outputs(b, &if_es_thread->then_list, &state, true);
b->cursor = nir_after_cf_list(&if_es_thread->then_list);
/* This should be after streamout and before exports. */
@ -1882,7 +1852,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
} while (progress);
*out_lds_vertex_size =
ac_ngg_nogs_get_pervertex_lds_size(shader->info.stage, shader->num_outputs, state.streamout_enabled,
ac_ngg_nogs_get_pervertex_lds_size(&state, shader->info.stage, state.streamout_enabled,
options->export_primitive_id, state.has_user_edgeflags,
options->can_cull, state.deferred.uses_instance_id,
state.deferred.uses_tess_primitive_id);

View file

@ -163,7 +163,6 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
* In case of packed 16-bit, we assume that has been already packed into 32 bit slots by now.
*/
u_foreach_bit64(slot, b->shader->info.outputs_written) {
const unsigned packed_location = util_bitcount64((b->shader->info.outputs_written & BITFIELD64_MASK(slot)));
unsigned mask = gs_output_component_mask_with_stream(&s->out.infos[slot], stream);
nir_def **output = s->out.outputs[slot];
@ -185,7 +184,7 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
nir_def *store_val = nir_vec(b, values, (unsigned)count);
nir_store_shared(b, store_val, gs_emit_vtx_addr,
.base = packed_location * 16 + start * 4,
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, slot, start),
.align_mul = 4);
}
@ -193,13 +192,8 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
memset(s->out.outputs[slot], 0, sizeof(s->out.outputs[slot]));
}
const unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written);
/* Store dedicated 16-bit outputs to LDS. */
u_foreach_bit(slot, b->shader->info.outputs_written_16bit) {
const unsigned packed_location = num_32bit_outputs +
util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(slot));
const unsigned mask_lo = gs_output_component_mask_with_stream(s->out.infos_16bit_lo + slot, stream);
const unsigned mask_hi = gs_output_component_mask_with_stream(s->out.infos_16bit_hi + slot, stream);
unsigned mask = mask_lo | mask_hi;
@ -221,7 +215,7 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
nir_def *store_val = nir_vec(b, values, (unsigned)count);
nir_store_shared(b, store_val, gs_emit_vtx_addr,
.base = packed_location * 16 + start * 4,
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_VAR0_16BIT + slot, start),
.align_mul = 4);
}
@ -387,9 +381,6 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g
}
u_foreach_bit64(slot, b->shader->info.outputs_written) {
const unsigned packed_location =
util_bitcount64((b->shader->info.outputs_written & BITFIELD64_MASK(slot)));
unsigned mask = gs_output_component_mask_with_stream(&s->out.infos[slot], 0);
while (mask) {
@ -397,7 +388,7 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g
u_bit_scan_consecutive_range(&mask, &start, &count);
nir_def *load =
nir_load_shared(b, count, 32, exported_out_vtx_lds_addr,
.base = packed_location * 16 + start * 4,
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, slot, start),
.align_mul = 4);
for (int i = 0; i < count; i++)
@ -405,13 +396,8 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g
}
}
const unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written);
/* Dedicated 16-bit outputs. */
u_foreach_bit(i, b->shader->info.outputs_written_16bit) {
const unsigned packed_location = num_32bit_outputs +
util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(i));
const unsigned mask_lo = gs_output_component_mask_with_stream(&s->out.infos_16bit_lo[i], 0);
const unsigned mask_hi = gs_output_component_mask_with_stream(&s->out.infos_16bit_hi[i], 0);
unsigned mask = mask_lo | mask_hi;
@ -421,7 +407,7 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g
u_bit_scan_consecutive_range(&mask, &start, &count);
nir_def *load =
nir_load_shared(b, count, 32, exported_out_vtx_lds_addr,
.base = packed_location * 16 + start * 4,
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_VAR0_16BIT + i, start),
.align_mul = 4);
for (int j = 0; j < count; j++) {
@ -639,11 +625,12 @@ ngg_gs_cull_primitive(nir_builder *b, nir_def *tid_in_tg, nir_def *max_vtxcnt,
/* Load the positions from LDS. */
nir_def *pos[3][4];
for (unsigned i = 0; i < s->num_vertices_per_primitive; i++) {
/* VARYING_SLOT_POS == 0, so base won't count packed location */
pos[i][3] = nir_load_shared(b, 1, 32, vtxptr[i], .base = 12); /* W */
nir_def *xy = nir_load_shared(b, 2, 32, vtxptr[i], .base = 0, .align_mul = 4);
pos[i][0] = nir_channel(b, xy, 0);
pos[i][1] = nir_channel(b, xy, 1);
/* Load X, Y, W position components. */
for (unsigned c = 0; c < 4; c == 1 ? c += 2 : c++) {
pos[i][c] = nir_load_shared(b, 1, 32, vtxptr[i],
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_POS, c),
.align_mul = 4);
}
pos[i][0] = nir_fdiv(b, pos[i][0], pos[i][3]);
pos[i][1] = nir_fdiv(b, pos[i][1], pos[i][3]);
@ -806,7 +793,7 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s)
ac_nir_ngg_build_streamout_vertex(b, info, stream, so_buffer,
stream_buffer_offsets, i,
exported_vtx_lds_addr[i],
&s->out, false);
&s->out);
}
}
nir_pop_if(b, if_emit);
@ -903,16 +890,10 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options,
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
assert(impl);
/* Add 4 for primflags. */
*out_lds_vertex_size = (util_bitcount64(shader->info.outputs_written) +
util_bitcount(shader->info.outputs_written_16bit)) * 16 + 4;
lower_ngg_gs_state state = {
.options = options,
.impl = impl,
.max_num_waves = DIV_ROUND_UP(options->max_workgroup_size, options->wave_size),
.lds_offs_primflags = *out_lds_vertex_size - 4,
.lds_bytes_per_gs_out_vertex = *out_lds_vertex_size,
.streamout_enabled = shader->xfb_info && !options->disable_streamout,
};
@ -962,6 +943,11 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options,
* gathers output values, which must be done while lowering GS intrinsics and not before.
*/
gather_output_stores(shader, &state);
ac_nir_compute_prerast_packed_output_info(&state.out);
state.lds_offs_primflags = state.out.total_packed_gs_out_size;
/* Make the vertex size in LDS be an odd number of dwords (| 0x4) to reduce LDS bank conflicts. */
state.lds_bytes_per_gs_out_vertex = (state.out.total_packed_gs_out_size + 4 /*primflags*/) | 0x4;
lower_ngg_gs_intrinsics(shader, &state);
if (state.streamout_enabled)
@ -995,5 +981,7 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options,
/* Cleanup */
nir_lower_vars_to_ssa(shader);
nir_remove_dead_variables(shader, nir_var_function_temp, NULL);
*out_lds_vertex_size = state.lds_bytes_per_gs_out_vertex;
return nir_progress(true, impl, nir_metadata_none);
}

View file

@ -1310,13 +1310,59 @@ ac_nir_ngg_build_streamout_buffer_info(nir_builder *b,
}
}
/* Compute the LDS byte offset of one component of a packed GS output.
 *
 * The slot's base offset comes from the precomputed packed_slot_gs_out_offset,
 * and the component offset is 4 bytes per written component that precedes
 * "component" in the slot's component mask (unwritten components take no space).
 *
 * For dedicated 16-bit slots (VARYING_SLOT_VAR0_16BIT and up), the lo and hi
 * halves share one packed slot, so their masks are merged.
 */
unsigned
ac_nir_get_lds_gs_out_slot_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component)
{
   assert(component < 4);

   unsigned slot_base, comp_mask;

   if (slot < VARYING_SLOT_VAR0_16BIT) {
      slot_base = pr_out->infos[slot].packed_slot_gs_out_offset;
      comp_mask = pr_out->infos[slot].components_mask;
   } else {
      const unsigned idx = slot - VARYING_SLOT_VAR0_16BIT;

      /* Both 16-bit halves must have been packed at the same slot offset. */
      assert(pr_out->infos_16bit_lo[idx].packed_slot_gs_out_offset ==
             pr_out->infos_16bit_hi[idx].packed_slot_gs_out_offset);

      slot_base = pr_out->infos_16bit_lo[idx].packed_slot_gs_out_offset;
      comp_mask = pr_out->infos_16bit_lo[idx].components_mask |
                  pr_out->infos_16bit_hi[idx].components_mask;
   }

   /* 4 bytes for each written component below the requested one. */
   return slot_base + util_bitcount(comp_mask & BITFIELD_MASK(component)) * 4;
}
/* Compute the LDS byte offset of one component of a packed XFB output.
 *
 * Like ac_nir_get_lds_gs_out_slot_offset, but using the XFB-specific packed
 * slot offsets and component masks: the slot's base offset is
 * packed_slot_xfb_lds_offset, plus 4 bytes per XFB-written component that
 * precedes "component" in xfb_lds_components_mask.
 *
 * 16-bit data in regular (non-VAR0_16BIT) slots is not implemented.
 */
unsigned
ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component,
                              bool data_is_16bit)
{
   assert(component < 4);

   unsigned slot_base = 0, comp_mask = 0;

   if (slot >= VARYING_SLOT_VAR0_16BIT) {
      const unsigned idx = slot - VARYING_SLOT_VAR0_16BIT;

      /* Both 16-bit halves must have been packed at the same slot offset. */
      assert(pr_out->infos_16bit_lo[idx].packed_slot_xfb_lds_offset ==
             pr_out->infos_16bit_hi[idx].packed_slot_xfb_lds_offset);

      slot_base = pr_out->infos_16bit_lo[idx].packed_slot_xfb_lds_offset;
      comp_mask = pr_out->infos_16bit_lo[idx].xfb_lds_components_mask |
                  pr_out->infos_16bit_hi[idx].xfb_lds_components_mask;
   } else if (data_is_16bit) {
      assert(!"unimplemented");
   } else {
      slot_base = pr_out->infos[slot].packed_slot_xfb_lds_offset;
      comp_mask = pr_out->infos[slot].xfb_lds_components_mask;
   }

   /* 4 bytes for each XFB-written component below the requested one. */
   return slot_base + util_bitcount(comp_mask & BITFIELD_MASK(component)) * 4;
}
void
ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
unsigned stream, nir_def *so_buffer[4],
nir_def *buffer_offsets[4],
unsigned vertex_index, nir_def *vtx_lds_addr,
ac_nir_prerast_out *pr_out,
bool skip_primitive_id)
ac_nir_prerast_out *pr_out)
{
unsigned vertex_offset[NIR_MAX_XFB_BUFFERS] = {0};
@ -1337,32 +1383,13 @@ ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
if (!out->component_mask || info->buffer_to_stream[out->buffer] != stream)
continue;
unsigned base;
if (out->location >= VARYING_SLOT_VAR0_16BIT) {
base =
util_bitcount64(b->shader->info.outputs_written) +
util_bitcount(b->shader->info.outputs_written_16bit &
BITFIELD_MASK(out->location - VARYING_SLOT_VAR0_16BIT));
} else {
uint64_t outputs_written = b->shader->info.outputs_written;
if (skip_primitive_id)
outputs_written &= ~VARYING_BIT_PRIMITIVE_ID;
base =
util_bitcount64(outputs_written &
BITFIELD64_MASK(out->location));
}
unsigned offset = (base * 4 + out->component_offset) * 4;
unsigned count = util_bitcount(out->component_mask);
assert(u_bit_consecutive(out->component_offset, count) == out->component_mask);
nir_def *out_data =
nir_load_shared(b, count, 32, vtx_lds_addr, .base = offset);
for (unsigned comp = 0; comp < count; comp++) {
nir_def *data = nir_channel(b, out_data, comp);
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(pr_out, out->location,
out->component_offset + comp,
out->data_is_16bit);
nir_def *data = nir_load_shared(b, 1, 32, vtx_lds_addr, .base = offset, .align_mul = 4);
/* Convert 16-bit outputs to 32-bit.
*