mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-22 10:40:22 +01:00
ac/nir/lower_ngg: pack GS outputs and XFB outputs in LDS optimally
This switches the code to the new slot offsets from ac_nir_prerast_out instead of using a prefix bitmask over outputs_written. The LDS layout no longer includes these: - GS: output components that are not written by GS - VS/TES+XFB: output components that are not written by XFB - VS/TES+XFB: slots that are not written by XFB (this could be significant) This is also a cleanup because it unduplicates the bitcounts. Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Reviewed-by: Timur Kristóf <timur.kristof@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35351>
This commit is contained in:
parent
75b1602c14
commit
814990684d
4 changed files with 116 additions and 125 deletions
|
|
@ -228,13 +228,19 @@ ac_nir_ngg_build_streamout_buffer_info(nir_builder *b,
|
|||
nir_def *buffer_offsets_ret[4],
|
||||
nir_def *emit_prim_ret[4]);
|
||||
|
||||
unsigned
|
||||
ac_nir_get_lds_gs_out_slot_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component);
|
||||
|
||||
unsigned
|
||||
ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component,
|
||||
bool data_is_16bit);
|
||||
|
||||
void
|
||||
ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
|
||||
unsigned stream, nir_def *so_buffer[4],
|
||||
nir_def *buffer_offsets[4],
|
||||
unsigned vertex_index, nir_def *vtx_lds_addr,
|
||||
ac_nir_prerast_out *pr_out,
|
||||
bool skip_primitive_id);
|
||||
ac_nir_prerast_out *pr_out);
|
||||
|
||||
void
|
||||
ac_nir_repack_invocations_in_workgroup(nir_builder *b, nir_def **input_bool,
|
||||
|
|
|
|||
|
|
@ -70,7 +70,6 @@ typedef struct
|
|||
bool early_prim_export;
|
||||
bool streamout_enabled;
|
||||
bool has_user_edgeflags;
|
||||
bool skip_primitive_id;
|
||||
unsigned max_num_waves;
|
||||
|
||||
/* LDS params */
|
||||
|
|
@ -205,18 +204,12 @@ emit_ngg_nogs_prim_export(nir_builder *b, lower_ngg_nogs_state *s, nir_def *arg)
|
|||
unsigned edge_flag_bits = ac_get_all_edge_flag_bits(s->options->hw_info->gfx_level);
|
||||
nir_def *mask = nir_imm_intN_t(b, ~edge_flag_bits, 32);
|
||||
|
||||
unsigned edge_flag_offset = 0;
|
||||
if (s->streamout_enabled) {
|
||||
unsigned packed_location =
|
||||
util_bitcount64(b->shader->info.outputs_written &
|
||||
BITFIELD64_MASK(VARYING_SLOT_EDGE));
|
||||
edge_flag_offset = packed_location * 16;
|
||||
}
|
||||
|
||||
for (int i = 0; i < s->options->num_vertices_per_primitive; i++) {
|
||||
nir_def *vtx_idx = nir_load_var(b, s->gs_vtx_indices_vars[i]);
|
||||
nir_def *addr = pervertex_lds_addr(b, vtx_idx, s->pervertex_lds_bytes);
|
||||
nir_def *edge = nir_load_shared(b, 1, 32, addr, .base = edge_flag_offset);
|
||||
/* Edge flags share LDS with XFB. */
|
||||
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_EDGE, 0, false);
|
||||
nir_def *edge = nir_load_shared(b, 1, 32, addr, .base = offset);
|
||||
|
||||
if (s->options->hw_info->gfx_level >= GFX12)
|
||||
mask = nir_ior(b, mask, nir_ishl_imm(b, edge, 8 + i * 9));
|
||||
|
|
@ -1310,18 +1303,12 @@ ngg_nogs_store_edgeflag_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
|
|||
nir_def *edgeflag = s->out.outputs[VARYING_SLOT_EDGE][0];
|
||||
edgeflag = nir_umin(b, edgeflag, nir_imm_int(b, 1));
|
||||
|
||||
/* user edge flag is stored at the beginning of a vertex if streamout is not enabled */
|
||||
unsigned offset = 0;
|
||||
if (s->streamout_enabled) {
|
||||
unsigned packed_location =
|
||||
util_bitcount64(b->shader->info.outputs_written & BITFIELD64_MASK(VARYING_SLOT_EDGE));
|
||||
offset = packed_location * 16;
|
||||
}
|
||||
|
||||
nir_def *tid = nir_load_local_invocation_index(b);
|
||||
nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);
|
||||
|
||||
nir_store_shared(b, edgeflag, addr, .base = offset);
|
||||
/* Edge flags share LDS with XFB. */
|
||||
nir_store_shared(b, edgeflag, addr,
|
||||
.base = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_EDGE, 0, false));
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
@ -1357,12 +1344,6 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
|
|||
nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);
|
||||
|
||||
u_foreach_bit64(slot, xfb_outputs) {
|
||||
uint64_t outputs_written = b->shader->info.outputs_written;
|
||||
if (s->skip_primitive_id)
|
||||
outputs_written &= ~VARYING_BIT_PRIMITIVE_ID;
|
||||
unsigned packed_location =
|
||||
util_bitcount64(outputs_written & BITFIELD64_MASK(slot));
|
||||
|
||||
unsigned mask = xfb_mask[slot];
|
||||
|
||||
/* Clear unused components. */
|
||||
|
|
@ -1381,15 +1362,13 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
|
|||
* OpenGL puts 16bit outputs in VARYING_SLOT_VAR0_16BIT.
|
||||
*/
|
||||
nir_def *store_val = nir_vec(b, &s->out.outputs[slot][start], (unsigned)count);
|
||||
nir_store_shared(b, store_val, addr, .base = packed_location * 16 + start * 4);
|
||||
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, slot, start,
|
||||
store_val->bit_size == 16);
|
||||
nir_store_shared(b, store_val, addr, .base = offset, .align_mul = 4);
|
||||
}
|
||||
}
|
||||
|
||||
unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written);
|
||||
u_foreach_bit64(slot, xfb_outputs_16bit) {
|
||||
unsigned packed_location = num_32bit_outputs +
|
||||
util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(slot));
|
||||
|
||||
unsigned mask_lo = xfb_mask_16bit_lo[slot];
|
||||
unsigned mask_hi = xfb_mask_16bit_hi[slot];
|
||||
|
||||
|
|
@ -1420,7 +1399,9 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
|
|||
}
|
||||
|
||||
nir_def *store_val = nir_vec(b, values, (unsigned)count);
|
||||
nir_store_shared(b, store_val, addr, .base = packed_location * 16 + start * 4);
|
||||
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_VAR0_16BIT + slot,
|
||||
start, true);
|
||||
nir_store_shared(b, store_val, addr, .base = offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1448,7 +1429,6 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s)
|
|||
/* Write out primitive data */
|
||||
nir_if *if_emit = nir_push_if(b, nir_ilt(b, tid_in_tg, emit_prim_per_stream[0]));
|
||||
{
|
||||
unsigned vtx_lds_stride = (b->shader->num_outputs * 4 + 1) * 4;
|
||||
nir_def *num_vert_per_prim = nir_load_num_vertices_per_primitive_amd(b);
|
||||
nir_def *first_vertex_idx = nir_imul(b, tid_in_tg, num_vert_per_prim);
|
||||
|
||||
|
|
@ -1463,9 +1443,9 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s)
|
|||
nir_push_if(b, nir_igt_imm(b, num_vert_per_prim, i));
|
||||
{
|
||||
nir_def *vtx_lds_idx = nir_load_var(b, s->gs_vtx_indices_vars[i]);
|
||||
nir_def *vtx_lds_addr = pervertex_lds_addr(b, vtx_lds_idx, vtx_lds_stride);
|
||||
nir_def *vtx_lds_addr = pervertex_lds_addr(b, vtx_lds_idx, s->pervertex_lds_bytes);
|
||||
ac_nir_ngg_build_streamout_vertex(b, info, 0, so_buffer, buffer_offsets, i,
|
||||
vtx_lds_addr, &s->out, s->skip_primitive_id);
|
||||
vtx_lds_addr, &s->out);
|
||||
}
|
||||
nir_pop_if(b, if_valid_vertex);
|
||||
}
|
||||
|
|
@ -1488,40 +1468,27 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s)
|
|||
}
|
||||
|
||||
static unsigned
|
||||
ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
|
||||
unsigned shader_num_outputs,
|
||||
ngg_nogs_get_pervertex_lds_size(lower_ngg_nogs_state *s,
|
||||
gl_shader_stage stage,
|
||||
bool streamout_enabled,
|
||||
bool export_prim_id,
|
||||
bool has_user_edgeflags)
|
||||
{
|
||||
unsigned pervertex_lds_bytes = 0;
|
||||
|
||||
if (streamout_enabled) {
|
||||
/* The extra dword is used to avoid LDS bank conflicts and store the primitive id.
|
||||
* TODO: only alloc space for outputs that really need streamout.
|
||||
*/
|
||||
pervertex_lds_bytes = (shader_num_outputs * 4 + 1) * 4;
|
||||
}
|
||||
|
||||
bool need_prim_id_store_shared = export_prim_id && stage == MESA_SHADER_VERTEX;
|
||||
if (need_prim_id_store_shared || has_user_edgeflags) {
|
||||
unsigned size = 0;
|
||||
if (need_prim_id_store_shared)
|
||||
size += 4;
|
||||
if (has_user_edgeflags)
|
||||
size += 4;
|
||||
unsigned xfb_size = streamout_enabled ? s->out.total_packed_xfb_lds_size : 0;
|
||||
unsigned non_xfb_size = ((int)has_user_edgeflags + (int)need_prim_id_store_shared) * 4;
|
||||
unsigned pervertex_lds_bytes = MAX2(xfb_size, non_xfb_size);
|
||||
|
||||
/* pad to odd dwords to avoid LDS bank conflict */
|
||||
size |= 4;
|
||||
|
||||
pervertex_lds_bytes = MAX2(pervertex_lds_bytes, size);
|
||||
}
|
||||
/* Or 0x4 to make the size an odd number of dwords to reduce LDS bank conflicts. */
|
||||
if (pervertex_lds_bytes)
|
||||
pervertex_lds_bytes |= 0x4;
|
||||
|
||||
return pervertex_lds_bytes;
|
||||
}
|
||||
|
||||
static void
|
||||
ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nogs_state *s)
|
||||
ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nogs_state *s,
|
||||
bool gather_values)
|
||||
{
|
||||
/* Assume:
|
||||
* - the shader used nir_lower_io_vars_to_temporaries
|
||||
|
|
@ -1541,15 +1508,19 @@ ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nog
|
|||
if (intrin->intrinsic != nir_intrinsic_store_output)
|
||||
continue;
|
||||
|
||||
ac_nir_gather_prerast_store_output_info(b, intrin, &s->out, true);
|
||||
nir_instr_remove(instr);
|
||||
ac_nir_gather_prerast_store_output_info(b, intrin, &s->out, gather_values);
|
||||
if (gather_values)
|
||||
nir_instr_remove(instr);
|
||||
}
|
||||
}
|
||||
|
||||
if (!gather_values)
|
||||
ac_nir_compute_prerast_packed_output_info(&s->out);
|
||||
}
|
||||
|
||||
static unsigned
|
||||
ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
|
||||
unsigned shader_num_outputs,
|
||||
ac_ngg_nogs_get_pervertex_lds_size(lower_ngg_nogs_state *s,
|
||||
gl_shader_stage stage,
|
||||
bool streamout_enabled,
|
||||
bool export_prim_id,
|
||||
bool has_user_edgeflags,
|
||||
|
|
@ -1563,8 +1534,7 @@ ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
|
|||
stage, uses_instance_id, uses_tess_primitive_id, NULL) : 0;
|
||||
|
||||
unsigned pervertex_lds_bytes =
|
||||
ngg_nogs_get_pervertex_lds_size(stage, shader_num_outputs, streamout_enabled,
|
||||
export_prim_id, has_user_edgeflags);
|
||||
ngg_nogs_get_pervertex_lds_size(s, stage, streamout_enabled, export_prim_id, has_user_edgeflags);
|
||||
|
||||
return MAX2(culling_pervertex_lds_bytes, pervertex_lds_bytes);
|
||||
}
|
||||
|
|
@ -1617,7 +1587,6 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
|
|||
.gs_exported_var = gs_exported_var,
|
||||
.max_num_waves = DIV_ROUND_UP(options->max_workgroup_size, options->wave_size),
|
||||
.has_user_edgeflags = has_user_edgeflags,
|
||||
.skip_primitive_id = streamout_enabled && (options->export_primitive_id || options->export_primitive_id_per_prim),
|
||||
};
|
||||
|
||||
/* Can't export the primitive ID both as per-vertex and per-primitive. */
|
||||
|
|
@ -1644,6 +1613,8 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
|
|||
save_reusable_variables(b, &state);
|
||||
}
|
||||
|
||||
ngg_nogs_gather_outputs(b, &impl->body, &state, false);
|
||||
|
||||
nir_cf_list *extracted = rzalloc(shader, nir_cf_list);
|
||||
nir_cf_extract(extracted, nir_before_impl(impl),
|
||||
nir_after_impl(impl));
|
||||
|
|
@ -1706,8 +1677,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
|
|||
|
||||
/* determine the LDS vertex stride */
|
||||
state.pervertex_lds_bytes =
|
||||
ngg_nogs_get_pervertex_lds_size(shader->info.stage,
|
||||
shader->num_outputs,
|
||||
ngg_nogs_get_pervertex_lds_size(&state, shader->info.stage,
|
||||
state.streamout_enabled,
|
||||
options->export_primitive_id,
|
||||
state.has_user_edgeflags);
|
||||
|
|
@ -1748,7 +1718,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
|
|||
nir_pop_if(b, if_es_thread);
|
||||
|
||||
/* Gather outputs data and types */
|
||||
ngg_nogs_gather_outputs(b, &if_es_thread->then_list, &state);
|
||||
ngg_nogs_gather_outputs(b, &if_es_thread->then_list, &state, true);
|
||||
b->cursor = nir_after_cf_list(&if_es_thread->then_list);
|
||||
|
||||
/* This should be after streamout and before exports. */
|
||||
|
|
@ -1882,7 +1852,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
|
|||
} while (progress);
|
||||
|
||||
*out_lds_vertex_size =
|
||||
ac_ngg_nogs_get_pervertex_lds_size(shader->info.stage, shader->num_outputs, state.streamout_enabled,
|
||||
ac_ngg_nogs_get_pervertex_lds_size(&state, shader->info.stage, state.streamout_enabled,
|
||||
options->export_primitive_id, state.has_user_edgeflags,
|
||||
options->can_cull, state.deferred.uses_instance_id,
|
||||
state.deferred.uses_tess_primitive_id);
|
||||
|
|
|
|||
|
|
@ -163,7 +163,6 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
|
|||
* In case of packed 16-bit, we assume that has been already packed into 32 bit slots by now.
|
||||
*/
|
||||
u_foreach_bit64(slot, b->shader->info.outputs_written) {
|
||||
const unsigned packed_location = util_bitcount64((b->shader->info.outputs_written & BITFIELD64_MASK(slot)));
|
||||
unsigned mask = gs_output_component_mask_with_stream(&s->out.infos[slot], stream);
|
||||
|
||||
nir_def **output = s->out.outputs[slot];
|
||||
|
|
@ -185,7 +184,7 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
|
|||
|
||||
nir_def *store_val = nir_vec(b, values, (unsigned)count);
|
||||
nir_store_shared(b, store_val, gs_emit_vtx_addr,
|
||||
.base = packed_location * 16 + start * 4,
|
||||
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, slot, start),
|
||||
.align_mul = 4);
|
||||
}
|
||||
|
||||
|
|
@ -193,13 +192,8 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
|
|||
memset(s->out.outputs[slot], 0, sizeof(s->out.outputs[slot]));
|
||||
}
|
||||
|
||||
const unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written);
|
||||
|
||||
/* Store dedicated 16-bit outputs to LDS. */
|
||||
u_foreach_bit(slot, b->shader->info.outputs_written_16bit) {
|
||||
const unsigned packed_location = num_32bit_outputs +
|
||||
util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(slot));
|
||||
|
||||
const unsigned mask_lo = gs_output_component_mask_with_stream(s->out.infos_16bit_lo + slot, stream);
|
||||
const unsigned mask_hi = gs_output_component_mask_with_stream(s->out.infos_16bit_hi + slot, stream);
|
||||
unsigned mask = mask_lo | mask_hi;
|
||||
|
|
@ -221,7 +215,7 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
|
|||
|
||||
nir_def *store_val = nir_vec(b, values, (unsigned)count);
|
||||
nir_store_shared(b, store_val, gs_emit_vtx_addr,
|
||||
.base = packed_location * 16 + start * 4,
|
||||
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_VAR0_16BIT + slot, start),
|
||||
.align_mul = 4);
|
||||
}
|
||||
|
||||
|
|
@ -387,9 +381,6 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g
|
|||
}
|
||||
|
||||
u_foreach_bit64(slot, b->shader->info.outputs_written) {
|
||||
const unsigned packed_location =
|
||||
util_bitcount64((b->shader->info.outputs_written & BITFIELD64_MASK(slot)));
|
||||
|
||||
unsigned mask = gs_output_component_mask_with_stream(&s->out.infos[slot], 0);
|
||||
|
||||
while (mask) {
|
||||
|
|
@ -397,7 +388,7 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g
|
|||
u_bit_scan_consecutive_range(&mask, &start, &count);
|
||||
nir_def *load =
|
||||
nir_load_shared(b, count, 32, exported_out_vtx_lds_addr,
|
||||
.base = packed_location * 16 + start * 4,
|
||||
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, slot, start),
|
||||
.align_mul = 4);
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
|
|
@ -405,13 +396,8 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g
|
|||
}
|
||||
}
|
||||
|
||||
const unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written);
|
||||
|
||||
/* Dedicated 16-bit outputs. */
|
||||
u_foreach_bit(i, b->shader->info.outputs_written_16bit) {
|
||||
const unsigned packed_location = num_32bit_outputs +
|
||||
util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(i));
|
||||
|
||||
const unsigned mask_lo = gs_output_component_mask_with_stream(&s->out.infos_16bit_lo[i], 0);
|
||||
const unsigned mask_hi = gs_output_component_mask_with_stream(&s->out.infos_16bit_hi[i], 0);
|
||||
unsigned mask = mask_lo | mask_hi;
|
||||
|
|
@ -421,7 +407,7 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g
|
|||
u_bit_scan_consecutive_range(&mask, &start, &count);
|
||||
nir_def *load =
|
||||
nir_load_shared(b, count, 32, exported_out_vtx_lds_addr,
|
||||
.base = packed_location * 16 + start * 4,
|
||||
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_VAR0_16BIT + i, start),
|
||||
.align_mul = 4);
|
||||
|
||||
for (int j = 0; j < count; j++) {
|
||||
|
|
@ -639,11 +625,12 @@ ngg_gs_cull_primitive(nir_builder *b, nir_def *tid_in_tg, nir_def *max_vtxcnt,
|
|||
/* Load the positions from LDS. */
|
||||
nir_def *pos[3][4];
|
||||
for (unsigned i = 0; i < s->num_vertices_per_primitive; i++) {
|
||||
/* VARYING_SLOT_POS == 0, so base won't count packed location */
|
||||
pos[i][3] = nir_load_shared(b, 1, 32, vtxptr[i], .base = 12); /* W */
|
||||
nir_def *xy = nir_load_shared(b, 2, 32, vtxptr[i], .base = 0, .align_mul = 4);
|
||||
pos[i][0] = nir_channel(b, xy, 0);
|
||||
pos[i][1] = nir_channel(b, xy, 1);
|
||||
/* Load X, Y, W position components. */
|
||||
for (unsigned c = 0; c < 4; c == 1 ? c += 2 : c++) {
|
||||
pos[i][c] = nir_load_shared(b, 1, 32, vtxptr[i],
|
||||
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_POS, c),
|
||||
.align_mul = 4);
|
||||
}
|
||||
|
||||
pos[i][0] = nir_fdiv(b, pos[i][0], pos[i][3]);
|
||||
pos[i][1] = nir_fdiv(b, pos[i][1], pos[i][3]);
|
||||
|
|
@ -806,7 +793,7 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s)
|
|||
ac_nir_ngg_build_streamout_vertex(b, info, stream, so_buffer,
|
||||
stream_buffer_offsets, i,
|
||||
exported_vtx_lds_addr[i],
|
||||
&s->out, false);
|
||||
&s->out);
|
||||
}
|
||||
}
|
||||
nir_pop_if(b, if_emit);
|
||||
|
|
@ -903,16 +890,10 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options,
|
|||
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
|
||||
assert(impl);
|
||||
|
||||
/* Add 4 for primflags. */
|
||||
*out_lds_vertex_size = (util_bitcount64(shader->info.outputs_written) +
|
||||
util_bitcount(shader->info.outputs_written_16bit)) * 16 + 4;
|
||||
|
||||
lower_ngg_gs_state state = {
|
||||
.options = options,
|
||||
.impl = impl,
|
||||
.max_num_waves = DIV_ROUND_UP(options->max_workgroup_size, options->wave_size),
|
||||
.lds_offs_primflags = *out_lds_vertex_size - 4,
|
||||
.lds_bytes_per_gs_out_vertex = *out_lds_vertex_size,
|
||||
.streamout_enabled = shader->xfb_info && !options->disable_streamout,
|
||||
};
|
||||
|
||||
|
|
@ -962,6 +943,11 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options,
|
|||
* gathers output values, which must be done while lowering GS intrinsics and not before.
|
||||
*/
|
||||
gather_output_stores(shader, &state);
|
||||
ac_nir_compute_prerast_packed_output_info(&state.out);
|
||||
state.lds_offs_primflags = state.out.total_packed_gs_out_size;
|
||||
/* Make the vertex size in LDS be an odd number of dwords (| 0x4) to reduce LDS bank conflicts. */
|
||||
state.lds_bytes_per_gs_out_vertex = (state.out.total_packed_gs_out_size + 4 /*primflags*/) | 0x4;
|
||||
|
||||
lower_ngg_gs_intrinsics(shader, &state);
|
||||
|
||||
if (state.streamout_enabled)
|
||||
|
|
@ -995,5 +981,7 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options,
|
|||
/* Cleanup */
|
||||
nir_lower_vars_to_ssa(shader);
|
||||
nir_remove_dead_variables(shader, nir_var_function_temp, NULL);
|
||||
|
||||
*out_lds_vertex_size = state.lds_bytes_per_gs_out_vertex;
|
||||
return nir_progress(true, impl, nir_metadata_none);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1310,13 +1310,59 @@ ac_nir_ngg_build_streamout_buffer_info(nir_builder *b,
|
|||
}
|
||||
}
|
||||
|
||||
unsigned
|
||||
ac_nir_get_lds_gs_out_slot_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component)
|
||||
{
|
||||
assert(component < 4);
|
||||
unsigned lds_slot_offset, lds_component_mask;
|
||||
|
||||
if (slot >= VARYING_SLOT_VAR0_16BIT) {
|
||||
unsigned i = slot - VARYING_SLOT_VAR0_16BIT;
|
||||
assert(pr_out->infos_16bit_lo[i].packed_slot_gs_out_offset ==
|
||||
pr_out->infos_16bit_hi[i].packed_slot_gs_out_offset);
|
||||
|
||||
lds_slot_offset = pr_out->infos_16bit_lo[i].packed_slot_gs_out_offset;
|
||||
lds_component_mask = pr_out->infos_16bit_lo[i].components_mask |
|
||||
pr_out->infos_16bit_hi[i].components_mask;
|
||||
} else {
|
||||
lds_slot_offset = pr_out->infos[slot].packed_slot_gs_out_offset;
|
||||
lds_component_mask = pr_out->infos[slot].components_mask;
|
||||
}
|
||||
|
||||
return lds_slot_offset + util_bitcount(lds_component_mask & BITFIELD_MASK(component)) * 4;
|
||||
}
|
||||
|
||||
unsigned
|
||||
ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component,
|
||||
bool data_is_16bit)
|
||||
{
|
||||
assert(component < 4);
|
||||
unsigned lds_slot_offset = 0, lds_component_mask = 0;
|
||||
|
||||
if (slot >= VARYING_SLOT_VAR0_16BIT) {
|
||||
unsigned i = slot - VARYING_SLOT_VAR0_16BIT;
|
||||
assert(pr_out->infos_16bit_lo[i].packed_slot_xfb_lds_offset ==
|
||||
pr_out->infos_16bit_hi[i].packed_slot_xfb_lds_offset);
|
||||
|
||||
lds_slot_offset = pr_out->infos_16bit_lo[i].packed_slot_xfb_lds_offset;
|
||||
lds_component_mask = pr_out->infos_16bit_lo[i].xfb_lds_components_mask |
|
||||
pr_out->infos_16bit_hi[i].xfb_lds_components_mask;
|
||||
} else if (data_is_16bit) {
|
||||
assert(!"unimplemented");
|
||||
} else {
|
||||
lds_slot_offset = pr_out->infos[slot].packed_slot_xfb_lds_offset;
|
||||
lds_component_mask = pr_out->infos[slot].xfb_lds_components_mask;
|
||||
}
|
||||
|
||||
return lds_slot_offset + util_bitcount(lds_component_mask & BITFIELD_MASK(component)) * 4;
|
||||
}
|
||||
|
||||
void
|
||||
ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
|
||||
unsigned stream, nir_def *so_buffer[4],
|
||||
nir_def *buffer_offsets[4],
|
||||
unsigned vertex_index, nir_def *vtx_lds_addr,
|
||||
ac_nir_prerast_out *pr_out,
|
||||
bool skip_primitive_id)
|
||||
ac_nir_prerast_out *pr_out)
|
||||
{
|
||||
unsigned vertex_offset[NIR_MAX_XFB_BUFFERS] = {0};
|
||||
|
||||
|
|
@ -1337,32 +1383,13 @@ ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
|
|||
if (!out->component_mask || info->buffer_to_stream[out->buffer] != stream)
|
||||
continue;
|
||||
|
||||
unsigned base;
|
||||
if (out->location >= VARYING_SLOT_VAR0_16BIT) {
|
||||
base =
|
||||
util_bitcount64(b->shader->info.outputs_written) +
|
||||
util_bitcount(b->shader->info.outputs_written_16bit &
|
||||
BITFIELD_MASK(out->location - VARYING_SLOT_VAR0_16BIT));
|
||||
} else {
|
||||
uint64_t outputs_written = b->shader->info.outputs_written;
|
||||
if (skip_primitive_id)
|
||||
outputs_written &= ~VARYING_BIT_PRIMITIVE_ID;
|
||||
|
||||
base =
|
||||
util_bitcount64(outputs_written &
|
||||
BITFIELD64_MASK(out->location));
|
||||
}
|
||||
|
||||
unsigned offset = (base * 4 + out->component_offset) * 4;
|
||||
unsigned count = util_bitcount(out->component_mask);
|
||||
|
||||
assert(u_bit_consecutive(out->component_offset, count) == out->component_mask);
|
||||
|
||||
nir_def *out_data =
|
||||
nir_load_shared(b, count, 32, vtx_lds_addr, .base = offset);
|
||||
|
||||
for (unsigned comp = 0; comp < count; comp++) {
|
||||
nir_def *data = nir_channel(b, out_data, comp);
|
||||
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(pr_out, out->location,
|
||||
out->component_offset + comp,
|
||||
out->data_is_16bit);
|
||||
nir_def *data = nir_load_shared(b, 1, 32, vtx_lds_addr, .base = offset, .align_mul = 4);
|
||||
|
||||
/* Convert 16-bit outputs to 32-bit.
|
||||
*
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue