diff --git a/src/amd/common/nir/ac_nir_helpers.h b/src/amd/common/nir/ac_nir_helpers.h index cada6d02bf7..4634deb4722 100644 --- a/src/amd/common/nir/ac_nir_helpers.h +++ b/src/amd/common/nir/ac_nir_helpers.h @@ -228,13 +228,19 @@ ac_nir_ngg_build_streamout_buffer_info(nir_builder *b, nir_def *buffer_offsets_ret[4], nir_def *emit_prim_ret[4]); +unsigned +ac_nir_get_lds_gs_out_slot_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component); + +unsigned +ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component, + bool data_is_16bit); + void ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info, unsigned stream, nir_def *so_buffer[4], nir_def *buffer_offsets[4], unsigned vertex_index, nir_def *vtx_lds_addr, - ac_nir_prerast_out *pr_out, - bool skip_primitive_id); + ac_nir_prerast_out *pr_out); void ac_nir_repack_invocations_in_workgroup(nir_builder *b, nir_def **input_bool, diff --git a/src/amd/common/nir/ac_nir_lower_ngg.c b/src/amd/common/nir/ac_nir_lower_ngg.c index 3e7827c17d3..7e812d9dfdf 100644 --- a/src/amd/common/nir/ac_nir_lower_ngg.c +++ b/src/amd/common/nir/ac_nir_lower_ngg.c @@ -70,7 +70,6 @@ typedef struct bool early_prim_export; bool streamout_enabled; bool has_user_edgeflags; - bool skip_primitive_id; unsigned max_num_waves; /* LDS params */ @@ -205,18 +204,12 @@ emit_ngg_nogs_prim_export(nir_builder *b, lower_ngg_nogs_state *s, nir_def *arg) unsigned edge_flag_bits = ac_get_all_edge_flag_bits(s->options->hw_info->gfx_level); nir_def *mask = nir_imm_intN_t(b, ~edge_flag_bits, 32); - unsigned edge_flag_offset = 0; - if (s->streamout_enabled) { - unsigned packed_location = - util_bitcount64(b->shader->info.outputs_written & - BITFIELD64_MASK(VARYING_SLOT_EDGE)); - edge_flag_offset = packed_location * 16; - } - for (int i = 0; i < s->options->num_vertices_per_primitive; i++) { nir_def *vtx_idx = nir_load_var(b, s->gs_vtx_indices_vars[i]); nir_def *addr = pervertex_lds_addr(b, vtx_idx, s->pervertex_lds_bytes); - nir_def *edge = nir_load_shared(b, 1, 32, addr, .base = edge_flag_offset); + /* Edge flags share LDS with XFB. */ + unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_EDGE, 0, false); + nir_def *edge = nir_load_shared(b, 1, 32, addr, .base = offset); if (s->options->hw_info->gfx_level >= GFX12) mask = nir_ior(b, mask, nir_ishl_imm(b, edge, 8 + i * 9)); @@ -1310,18 +1303,12 @@ ngg_nogs_store_edgeflag_to_lds(nir_builder *b, lower_ngg_nogs_state *s) nir_def *edgeflag = s->out.outputs[VARYING_SLOT_EDGE][0]; edgeflag = nir_umin(b, edgeflag, nir_imm_int(b, 1)); - /* user edge flag is stored at the beginning of a vertex if streamout is not enabled */ - unsigned offset = 0; - if (s->streamout_enabled) { - unsigned packed_location = - util_bitcount64(b->shader->info.outputs_written & BITFIELD64_MASK(VARYING_SLOT_EDGE)); - offset = packed_location * 16; - } - nir_def *tid = nir_load_local_invocation_index(b); nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes); - nir_store_shared(b, edgeflag, addr, .base = offset); + /* Edge flags share LDS with XFB. */ + nir_store_shared(b, edgeflag, addr, + .base = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_EDGE, 0, false)); } static void @@ -1357,12 +1344,6 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s) nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes); u_foreach_bit64(slot, xfb_outputs) { - uint64_t outputs_written = b->shader->info.outputs_written; - if (s->skip_primitive_id) - outputs_written &= ~VARYING_BIT_PRIMITIVE_ID; - unsigned packed_location = - util_bitcount64(outputs_written & BITFIELD64_MASK(slot)); - unsigned mask = xfb_mask[slot]; /* Clear unused components. */ @@ -1381,15 +1362,13 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s) * OpenGL puts 16bit outputs in VARYING_SLOT_VAR0_16BIT. */ nir_def *store_val = nir_vec(b, &s->out.outputs[slot][start], (unsigned)count); - nir_store_shared(b, store_val, addr, .base = packed_location * 16 + start * 4); + unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, slot, start, + store_val->bit_size == 16); + nir_store_shared(b, store_val, addr, .base = offset, .align_mul = 4); } } - unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written); u_foreach_bit64(slot, xfb_outputs_16bit) { - unsigned packed_location = num_32bit_outputs + - util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(slot)); - unsigned mask_lo = xfb_mask_16bit_lo[slot]; unsigned mask_hi = xfb_mask_16bit_hi[slot]; @@ -1420,7 +1399,9 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s) } nir_def *store_val = nir_vec(b, values, (unsigned)count); - nir_store_shared(b, store_val, addr, .base = packed_location * 16 + start * 4); + unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_VAR0_16BIT + slot, + start, true); + nir_store_shared(b, store_val, addr, .base = offset); } } } @@ -1448,7 +1429,6 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s) /* Write out primitive data */ nir_if *if_emit = nir_push_if(b, nir_ilt(b, tid_in_tg, emit_prim_per_stream[0])); { - unsigned vtx_lds_stride = (b->shader->num_outputs * 4 + 1) * 4; nir_def *num_vert_per_prim = nir_load_num_vertices_per_primitive_amd(b); nir_def *first_vertex_idx = nir_imul(b, tid_in_tg, num_vert_per_prim); @@ -1463,9 +1443,9 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s) nir_push_if(b, nir_igt_imm(b, num_vert_per_prim, i)); { nir_def *vtx_lds_idx = nir_load_var(b, s->gs_vtx_indices_vars[i]); - nir_def *vtx_lds_addr = pervertex_lds_addr(b, vtx_lds_idx, vtx_lds_stride); + nir_def *vtx_lds_addr = pervertex_lds_addr(b, vtx_lds_idx, s->pervertex_lds_bytes); ac_nir_ngg_build_streamout_vertex(b, info, 0, so_buffer, buffer_offsets, i, - vtx_lds_addr, &s->out, s->skip_primitive_id); + vtx_lds_addr, &s->out); } nir_pop_if(b, if_valid_vertex); } @@ -1488,40 +1468,27 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s) } static unsigned -ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage, - unsigned shader_num_outputs, +ngg_nogs_get_pervertex_lds_size(lower_ngg_nogs_state *s, + gl_shader_stage stage, bool streamout_enabled, bool export_prim_id, bool has_user_edgeflags) { - unsigned pervertex_lds_bytes = 0; - - if (streamout_enabled) { - /* The extra dword is used to avoid LDS bank conflicts and store the primitive id. - * TODO: only alloc space for outputs that really need streamout. - */ - pervertex_lds_bytes = (shader_num_outputs * 4 + 1) * 4; - } - bool need_prim_id_store_shared = export_prim_id && stage == MESA_SHADER_VERTEX; - if (need_prim_id_store_shared || has_user_edgeflags) { - unsigned size = 0; - if (need_prim_id_store_shared) - size += 4; - if (has_user_edgeflags) - size += 4; + unsigned xfb_size = streamout_enabled ? s->out.total_packed_xfb_lds_size : 0; + unsigned non_xfb_size = ((int)has_user_edgeflags + (int)need_prim_id_store_shared) * 4; + unsigned pervertex_lds_bytes = MAX2(xfb_size, non_xfb_size); - /* pad to odd dwords to avoid LDS bank conflict */ - size |= 4; - - pervertex_lds_bytes = MAX2(pervertex_lds_bytes, size); - } + /* Or 0x4 to make the size an odd number of dwords to reduce LDS bank conflicts. */ + if (pervertex_lds_bytes) + pervertex_lds_bytes |= 0x4; return pervertex_lds_bytes; } static void -ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nogs_state *s) +ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nogs_state *s, + bool gather_values) { /* Assume: * - the shader used nir_lower_io_vars_to_temporaries @@ -1541,15 +1508,19 @@ ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nog if (intrin->intrinsic != nir_intrinsic_store_output) continue; - ac_nir_gather_prerast_store_output_info(b, intrin, &s->out, true); - nir_instr_remove(instr); + ac_nir_gather_prerast_store_output_info(b, intrin, &s->out, gather_values); + if (gather_values) + nir_instr_remove(instr); } } + + if (!gather_values) + ac_nir_compute_prerast_packed_output_info(&s->out); } static unsigned -ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage, - unsigned shader_num_outputs, +ac_ngg_nogs_get_pervertex_lds_size(lower_ngg_nogs_state *s, + gl_shader_stage stage, bool streamout_enabled, bool export_prim_id, bool has_user_edgeflags, @@ -1563,8 +1534,7 @@ ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage, stage, uses_instance_id, uses_tess_primitive_id, NULL) : 0; unsigned pervertex_lds_bytes = - ngg_nogs_get_pervertex_lds_size(stage, shader_num_outputs, streamout_enabled, - export_prim_id, has_user_edgeflags); + ngg_nogs_get_pervertex_lds_size(s, stage, streamout_enabled, export_prim_id, has_user_edgeflags); return MAX2(culling_pervertex_lds_bytes, pervertex_lds_bytes); } @@ -1617,7 +1587,6 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option .gs_exported_var = gs_exported_var, .max_num_waves = DIV_ROUND_UP(options->max_workgroup_size, options->wave_size), .has_user_edgeflags = has_user_edgeflags, - .skip_primitive_id = streamout_enabled && (options->export_primitive_id || options->export_primitive_id_per_prim), }; /* Can't export the primitive ID both as per-vertex and per-primitive. */ @@ -1644,6 +1613,8 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option save_reusable_variables(b, &state); } + ngg_nogs_gather_outputs(b, &impl->body, &state, false); + nir_cf_list *extracted = rzalloc(shader, nir_cf_list); nir_cf_extract(extracted, nir_before_impl(impl), nir_after_impl(impl)); @@ -1706,8 +1677,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option /* determine the LDS vertex stride */ state.pervertex_lds_bytes = - ngg_nogs_get_pervertex_lds_size(shader->info.stage, - shader->num_outputs, + ngg_nogs_get_pervertex_lds_size(&state, shader->info.stage, state.streamout_enabled, options->export_primitive_id, state.has_user_edgeflags); @@ -1748,7 +1718,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option nir_pop_if(b, if_es_thread); /* Gather outputs data and types */ - ngg_nogs_gather_outputs(b, &if_es_thread->then_list, &state); + ngg_nogs_gather_outputs(b, &if_es_thread->then_list, &state, true); b->cursor = nir_after_cf_list(&if_es_thread->then_list); /* This should be after streamout and before exports. */ @@ -1882,7 +1852,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option } while (progress); *out_lds_vertex_size = - ac_ngg_nogs_get_pervertex_lds_size(shader->info.stage, shader->num_outputs, state.streamout_enabled, + ac_ngg_nogs_get_pervertex_lds_size(&state, shader->info.stage, state.streamout_enabled, options->export_primitive_id, state.has_user_edgeflags, options->can_cull, state.deferred.uses_instance_id, state.deferred.uses_tess_primitive_id); diff --git a/src/amd/common/nir/ac_nir_lower_ngg_gs.c b/src/amd/common/nir/ac_nir_lower_ngg_gs.c index 0a4e8fcb575..c0ce2a1fb3f 100644 --- a/src/amd/common/nir/ac_nir_lower_ngg_gs.c +++ b/src/amd/common/nir/ac_nir_lower_ngg_gs.c @@ -163,7 +163,6 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri * In case of packed 16-bit, we assume that has been already packed into 32 bit slots by now. */ u_foreach_bit64(slot, b->shader->info.outputs_written) { - const unsigned packed_location = util_bitcount64((b->shader->info.outputs_written & BITFIELD64_MASK(slot))); unsigned mask = gs_output_component_mask_with_stream(&s->out.infos[slot], stream); nir_def **output = s->out.outputs[slot]; @@ -185,7 +184,7 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri nir_def *store_val = nir_vec(b, values, (unsigned)count); nir_store_shared(b, store_val, gs_emit_vtx_addr, - .base = packed_location * 16 + start * 4, + .base = ac_nir_get_lds_gs_out_slot_offset(&s->out, slot, start), .align_mul = 4); } @@ -193,13 +192,8 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri memset(s->out.outputs[slot], 0, sizeof(s->out.outputs[slot])); } - const unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written); - /* Store dedicated 16-bit outputs to LDS. */ u_foreach_bit(slot, b->shader->info.outputs_written_16bit) { - const unsigned packed_location = num_32bit_outputs + - util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(slot)); - const unsigned mask_lo = gs_output_component_mask_with_stream(s->out.infos_16bit_lo + slot, stream); const unsigned mask_hi = gs_output_component_mask_with_stream(s->out.infos_16bit_hi + slot, stream); unsigned mask = mask_lo | mask_hi; @@ -221,7 +215,7 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri nir_def *store_val = nir_vec(b, values, (unsigned)count); nir_store_shared(b, store_val, gs_emit_vtx_addr, - .base = packed_location * 16 + start * 4, + .base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_VAR0_16BIT + slot, start), .align_mul = 4); } @@ -387,9 +381,6 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g } u_foreach_bit64(slot, b->shader->info.outputs_written) { - const unsigned packed_location = - util_bitcount64((b->shader->info.outputs_written & BITFIELD64_MASK(slot))); - unsigned mask = gs_output_component_mask_with_stream(&s->out.infos[slot], 0); while (mask) { @@ -397,7 +388,7 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g u_bit_scan_consecutive_range(&mask, &start, &count); nir_def *load = nir_load_shared(b, count, 32, exported_out_vtx_lds_addr, - .base = packed_location * 16 + start * 4, + .base = ac_nir_get_lds_gs_out_slot_offset(&s->out, slot, start), .align_mul = 4); for (int i = 0; i < count; i++) @@ -405,13 +396,8 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g } } - const unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written); - /* Dedicated 16-bit outputs. */ u_foreach_bit(i, b->shader->info.outputs_written_16bit) { - const unsigned packed_location = num_32bit_outputs + - util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(i)); - const unsigned mask_lo = gs_output_component_mask_with_stream(&s->out.infos_16bit_lo[i], 0); const unsigned mask_hi = gs_output_component_mask_with_stream(&s->out.infos_16bit_hi[i], 0); unsigned mask = mask_lo | mask_hi; @@ -421,7 +407,7 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g u_bit_scan_consecutive_range(&mask, &start, &count); nir_def *load = nir_load_shared(b, count, 32, exported_out_vtx_lds_addr, - .base = packed_location * 16 + start * 4, + .base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_VAR0_16BIT + i, start), .align_mul = 4); for (int j = 0; j < count; j++) { @@ -639,11 +625,12 @@ ngg_gs_cull_primitive(nir_builder *b, nir_def *tid_in_tg, nir_def *max_vtxcnt, /* Load the positions from LDS. */ nir_def *pos[3][4]; for (unsigned i = 0; i < s->num_vertices_per_primitive; i++) { - /* VARYING_SLOT_POS == 0, so base won't count packed location */ - pos[i][3] = nir_load_shared(b, 1, 32, vtxptr[i], .base = 12); /* W */ - nir_def *xy = nir_load_shared(b, 2, 32, vtxptr[i], .base = 0, .align_mul = 4); - pos[i][0] = nir_channel(b, xy, 0); - pos[i][1] = nir_channel(b, xy, 1); + /* Load X, Y, W position components. */ + for (unsigned c = 0; c < 4; c == 1 ? c += 2 : c++) { + pos[i][c] = nir_load_shared(b, 1, 32, vtxptr[i], + .base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_POS, c), + .align_mul = 4); + } pos[i][0] = nir_fdiv(b, pos[i][0], pos[i][3]); pos[i][1] = nir_fdiv(b, pos[i][1], pos[i][3]); @@ -806,7 +793,7 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s) ac_nir_ngg_build_streamout_vertex(b, info, stream, so_buffer, stream_buffer_offsets, i, exported_vtx_lds_addr[i], - &s->out, false); + &s->out); } } nir_pop_if(b, if_emit); @@ -903,16 +890,10 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options, nir_function_impl *impl = nir_shader_get_entrypoint(shader); assert(impl); - /* Add 4 for primflags. */ - *out_lds_vertex_size = (util_bitcount64(shader->info.outputs_written) + - util_bitcount(shader->info.outputs_written_16bit)) * 16 + 4; - lower_ngg_gs_state state = { .options = options, .impl = impl, .max_num_waves = DIV_ROUND_UP(options->max_workgroup_size, options->wave_size), - .lds_offs_primflags = *out_lds_vertex_size - 4, - .lds_bytes_per_gs_out_vertex = *out_lds_vertex_size, .streamout_enabled = shader->xfb_info && !options->disable_streamout, }; @@ -962,6 +943,11 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options, * gathers output values, which must be done while lowering GS intrinsics and not before. */ gather_output_stores(shader, &state); + ac_nir_compute_prerast_packed_output_info(&state.out); + state.lds_offs_primflags = state.out.total_packed_gs_out_size; + /* Make the vertex size in LDS be an odd number of dwords (| 0x4) to reduce LDS bank conflicts. */ + state.lds_bytes_per_gs_out_vertex = (state.out.total_packed_gs_out_size + 4 /*primflags*/) | 0x4; + lower_ngg_gs_intrinsics(shader, &state); if (state.streamout_enabled) @@ -995,5 +981,7 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options, /* Cleanup */ nir_lower_vars_to_ssa(shader); nir_remove_dead_variables(shader, nir_var_function_temp, NULL); + + *out_lds_vertex_size = state.lds_bytes_per_gs_out_vertex; return nir_progress(true, impl, nir_metadata_none); } diff --git a/src/amd/common/nir/ac_nir_prerast_utils.c b/src/amd/common/nir/ac_nir_prerast_utils.c index efb39f36cfb..36fe23d275e 100644 --- a/src/amd/common/nir/ac_nir_prerast_utils.c +++ b/src/amd/common/nir/ac_nir_prerast_utils.c @@ -1310,13 +1310,59 @@ ac_nir_ngg_build_streamout_buffer_info(nir_builder *b, } } +unsigned +ac_nir_get_lds_gs_out_slot_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component) +{ + assert(component < 4); + unsigned lds_slot_offset, lds_component_mask; + + if (slot >= VARYING_SLOT_VAR0_16BIT) { + unsigned i = slot - VARYING_SLOT_VAR0_16BIT; + assert(pr_out->infos_16bit_lo[i].packed_slot_gs_out_offset == + pr_out->infos_16bit_hi[i].packed_slot_gs_out_offset); + + lds_slot_offset = pr_out->infos_16bit_lo[i].packed_slot_gs_out_offset; + lds_component_mask = pr_out->infos_16bit_lo[i].components_mask | + pr_out->infos_16bit_hi[i].components_mask; + } else { + lds_slot_offset = pr_out->infos[slot].packed_slot_gs_out_offset; + lds_component_mask = pr_out->infos[slot].components_mask; + } + + return lds_slot_offset + util_bitcount(lds_component_mask & BITFIELD_MASK(component)) * 4; +} + +unsigned +ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component, + bool data_is_16bit) +{ + assert(component < 4); + unsigned lds_slot_offset = 0, lds_component_mask = 0; + + if (slot >= VARYING_SLOT_VAR0_16BIT) { + unsigned i = slot - VARYING_SLOT_VAR0_16BIT; + assert(pr_out->infos_16bit_lo[i].packed_slot_xfb_lds_offset == + pr_out->infos_16bit_hi[i].packed_slot_xfb_lds_offset); + + lds_slot_offset = pr_out->infos_16bit_lo[i].packed_slot_xfb_lds_offset; + lds_component_mask = pr_out->infos_16bit_lo[i].xfb_lds_components_mask | + pr_out->infos_16bit_hi[i].xfb_lds_components_mask; + } else if (data_is_16bit) { + assert(!"unimplemented"); + } else { + lds_slot_offset = pr_out->infos[slot].packed_slot_xfb_lds_offset; + lds_component_mask = pr_out->infos[slot].xfb_lds_components_mask; + } + + return lds_slot_offset + util_bitcount(lds_component_mask & BITFIELD_MASK(component)) * 4; +} + void ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info, unsigned stream, nir_def *so_buffer[4], nir_def *buffer_offsets[4], unsigned vertex_index, nir_def *vtx_lds_addr, - ac_nir_prerast_out *pr_out, - bool skip_primitive_id) + ac_nir_prerast_out *pr_out) { unsigned vertex_offset[NIR_MAX_XFB_BUFFERS] = {0}; @@ -1337,32 +1383,13 @@ ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info, if (!out->component_mask || info->buffer_to_stream[out->buffer] != stream) continue; - unsigned base; - if (out->location >= VARYING_SLOT_VAR0_16BIT) { - base = - util_bitcount64(b->shader->info.outputs_written) + - util_bitcount(b->shader->info.outputs_written_16bit & - BITFIELD_MASK(out->location - VARYING_SLOT_VAR0_16BIT)); - } else { - uint64_t outputs_written = b->shader->info.outputs_written; - if (skip_primitive_id) - outputs_written &= ~VARYING_BIT_PRIMITIVE_ID; - - base = - util_bitcount64(outputs_written & - BITFIELD64_MASK(out->location)); - } - - unsigned offset = (base * 4 + out->component_offset) * 4; unsigned count = util_bitcount(out->component_mask); - assert(u_bit_consecutive(out->component_offset, count) == out->component_mask); - - nir_def *out_data = - nir_load_shared(b, count, 32, vtx_lds_addr, .base = offset); - for (unsigned comp = 0; comp < count; comp++) { - nir_def *data = nir_channel(b, out_data, comp); + unsigned offset = ac_nir_ngg_get_xfb_lds_offset(pr_out, out->location, + out->component_offset + comp, + out->data_is_16bit); + nir_def *data = nir_load_shared(b, 1, 32, vtx_lds_addr, .base = offset, .align_mul = 4); /* Convert 16-bit outputs to 32-bit. *