mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-07 02:48:06 +02:00
ac/nir/ngg: Fix scratch space for NGG GS streamout
For GS streamout, we need the following LDS scratch space:

- Repacking streamout vertices takes 1 dword per 4 waves per stream
  (max 16 bytes for Wave64, max 32 bytes for Wave32)
- 1 dword per stream for buffer info (16 bytes)
- 1 dword per buffer for buffer info (16 bytes)

Previously, the space used for buffer info aliased with the space for
repacking the output vertices in ngg_gs_finale(), and there was no barrier
in between, which caused a race condition, resulting in random failure.

Fix this by allocating a few more LDS dwords so that aliasing is not
required, which also allows us to remove an extra workgroup barrier.

Cc: mesa-stable
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12705
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38364>
This commit is contained in:
parent
13148afd0e
commit
8f99d736d0
2 changed files with 26 additions and 12 deletions
|
|
@ -1817,10 +1817,25 @@ ac_ngg_get_scratch_lds_size(mesa_shader_stage stage,
|
|||
} else {
|
||||
assert(stage == MESA_SHADER_GEOMETRY);
|
||||
|
||||
/* Repacking output vertices at the end in ngg_gs_finale() uses 1 dword per 4 waves */
|
||||
scratch_lds_size = ALIGN(max_num_waves, 4u);
|
||||
/* streamout take 8 dwords for buffer offset and emit vertex per stream */
|
||||
if (streamout_enabled)
|
||||
scratch_lds_size = MAX2(scratch_lds_size, 32);
|
||||
|
||||
/* For streamout:
|
||||
* - Repacking streamout vertices takes 1 dword per 4 waves per stream
|
||||
* (max 16 bytes for Wave64, 32 bytes for Wave32)
|
||||
* - 1 dword per stream for buffer info
|
||||
* (16 bytes)
|
||||
* - 1 dword per buffer for buffer info
|
||||
* (16 bytes)
|
||||
*/
|
||||
if (streamout_enabled) {
|
||||
const unsigned num_streams = 4;
|
||||
const unsigned num_so_buffers = 4;
|
||||
const unsigned streamout_scratch_size =
|
||||
num_streams * ALIGN(max_num_waves, 4u) + num_streams * 4 + num_so_buffers * 4;
|
||||
|
||||
scratch_lds_size += streamout_scratch_size;
|
||||
}
|
||||
}
|
||||
|
||||
return scratch_lds_size;
|
||||
|
|
|
|||
|
|
@ -660,6 +660,10 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s)
|
|||
nir_def *export_seq[4] = {0};
|
||||
nir_def *out_vtx_primflag[4] = {0};
|
||||
|
||||
const unsigned scratch_stride = ALIGN(s->max_num_waves, 4);
|
||||
const unsigned scratch_base_off = scratch_stride;
|
||||
const unsigned num_streams = util_bitcount(info->streams_written);
|
||||
|
||||
u_foreach_bit(stream, info->streams_written) {
|
||||
out_vtx_primflag[stream] =
|
||||
ngg_gs_load_out_vtx_primflag(b, stream, tid_in_tg, out_vtx_lds_addr, max_vtxcnt, s);
|
||||
|
|
@ -669,9 +673,8 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s)
|
|||
*/
|
||||
prim_live[stream] = nir_i2b(b, nir_iand_imm(b, out_vtx_primflag[stream], 1));
|
||||
|
||||
unsigned scratch_stride = ALIGN(s->max_num_waves, 4);
|
||||
nir_def *scratch_base =
|
||||
nir_iadd_imm(b, s->lds_addr_gs_out_vtx, stream * scratch_stride);
|
||||
nir_iadd_imm(b, s->lds_addr_gs_out_vtx, stream * scratch_stride + scratch_base_off);
|
||||
|
||||
/* We want to export primitives to streamout buffer in sequence,
|
||||
* but not all vertices are alive or mark end of a primitive, so
|
||||
|
|
@ -697,18 +700,14 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s)
|
|||
export_seq[stream] = rep.repacked_invocation_index;
|
||||
}
|
||||
|
||||
/* Workgroup barrier: wait for LDS scratch reads finish. */
|
||||
nir_barrier(b, .execution_scope = SCOPE_WORKGROUP,
|
||||
.memory_scope = SCOPE_WORKGROUP,
|
||||
.memory_semantics = NIR_MEMORY_ACQ_REL,
|
||||
.memory_modes = nir_var_mem_shared);
|
||||
|
||||
/* Get global buffer offset where this workgroup will stream out data to. */
|
||||
nir_def *emit_prim[4] = {0};
|
||||
nir_def *buffer_offsets[4] = {0};
|
||||
nir_def *so_buffer[4] = {0};
|
||||
nir_def *buffer_info_scratch_base =
|
||||
nir_iadd_imm_nuw(b, s->lds_addr_gs_out_vtx, num_streams * scratch_stride + scratch_base_off);
|
||||
ac_nir_ngg_build_streamout_buffer_info(b, info, s->options->hw_info->gfx_level, s->options->has_xfb_prim_query,
|
||||
s->options->use_gfx12_xfb_intrinsic, s->lds_addr_gs_out_vtx, tid_in_tg,
|
||||
s->options->use_gfx12_xfb_intrinsic, buffer_info_scratch_base, tid_in_tg,
|
||||
gen_prim, so_buffer, buffer_offsets, emit_prim);
|
||||
|
||||
u_foreach_bit(stream, info->streams_written) {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue