diff --git a/.pick_status.json b/.pick_status.json
index 15af04eec76..e5520e2e169 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -754,7 +754,7 @@
         "description": "ac/nir/ngg: Fix scratch space for NGG GS streamout",
         "nominated": true,
         "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null,
         "notes": null
diff --git a/src/amd/common/nir/ac_nir_lower_ngg.c b/src/amd/common/nir/ac_nir_lower_ngg.c
index 03eae2d8fb3..aa454239307 100644
--- a/src/amd/common/nir/ac_nir_lower_ngg.c
+++ b/src/amd/common/nir/ac_nir_lower_ngg.c
@@ -1817,10 +1817,25 @@ ac_ngg_get_scratch_lds_size(mesa_shader_stage stage,
    } else {
       assert(stage == MESA_SHADER_GEOMETRY);
 
+      /* Repacking output vertices at the end in ngg_gs_finale() uses 1 dword per 4 waves */
       scratch_lds_size = ALIGN(max_num_waves, 4u);
-      /* streamout take 8 dwords for buffer offset and emit vertex per stream */
-      if (streamout_enabled)
-         scratch_lds_size = MAX2(scratch_lds_size, 32);
+
+      /* For streamout:
+       * - Repacking streamout vertices takes 1 dword per 4 waves per stream
+       *   (max 16 bytes for Wave64, 32 bytes for Wave32)
+       * - 1 dword per stream for buffer info
+       *   (16 bytes)
+       * - 1 dword per buffer for buffer info
+       *   (16 bytes)
+       */
+      if (streamout_enabled) {
+         const unsigned num_streams = 4;
+         const unsigned num_so_buffers = 4;
+         const unsigned streamout_scratch_size =
+            num_streams * ALIGN(max_num_waves, 4u) + num_streams * 4 + num_so_buffers * 4;
+
+         scratch_lds_size += streamout_scratch_size;
+      }
    }
 
    return scratch_lds_size;
diff --git a/src/amd/common/nir/ac_nir_lower_ngg_gs.c b/src/amd/common/nir/ac_nir_lower_ngg_gs.c
index bc2765cbeac..abd2d7a5d81 100644
--- a/src/amd/common/nir/ac_nir_lower_ngg_gs.c
+++ b/src/amd/common/nir/ac_nir_lower_ngg_gs.c
@@ -660,6 +660,10 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s)
    nir_def *export_seq[4] = {0};
    nir_def *out_vtx_primflag[4] = {0};
 
+   const unsigned scratch_stride = ALIGN(s->max_num_waves, 4);
+   const unsigned scratch_base_off = scratch_stride;
+   const unsigned num_streams = util_bitcount(info->streams_written);
+
    u_foreach_bit(stream, info->streams_written) {
       out_vtx_primflag[stream] =
         ngg_gs_load_out_vtx_primflag(b, stream, tid_in_tg, out_vtx_lds_addr, max_vtxcnt, s);
@@ -669,9 +673,8 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s)
        */
       prim_live[stream] = nir_i2b(b, nir_iand_imm(b, out_vtx_primflag[stream], 1));
 
-      unsigned scratch_stride = ALIGN(s->max_num_waves, 4);
       nir_def *scratch_base =
-         nir_iadd_imm(b, s->lds_addr_gs_out_vtx, stream * scratch_stride);
+         nir_iadd_imm(b, s->lds_addr_gs_out_vtx, stream * scratch_stride + scratch_base_off);
 
       /* We want to export primitives to streamout buffer in sequence,
        * but not all vertices are alive or mark end of a primitive, so
@@ -697,18 +700,14 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s)
       export_seq[stream] = rep.repacked_invocation_index;
    }
 
-   /* Workgroup barrier: wait for LDS scratch reads finish. */
-   nir_barrier(b, .execution_scope = SCOPE_WORKGROUP,
-               .memory_scope = SCOPE_WORKGROUP,
-               .memory_semantics = NIR_MEMORY_ACQ_REL,
-               .memory_modes = nir_var_mem_shared);
-
    /* Get global buffer offset where this workgroup will stream out data to.
     */
    nir_def *emit_prim[4] = {0};
    nir_def *buffer_offsets[4] = {0};
    nir_def *so_buffer[4] = {0};
+   nir_def *buffer_info_scratch_base =
+      nir_iadd_imm_nuw(b, s->lds_addr_gs_out_vtx, num_streams * scratch_stride + scratch_base_off);
    ac_nir_ngg_build_streamout_buffer_info(b, info, s->options->hw_info->gfx_level, s->options->has_xfb_prim_query,
-                                          s->options->use_gfx12_xfb_intrinsic, s->lds_addr_gs_out_vtx, tid_in_tg,
+                                          s->options->use_gfx12_xfb_intrinsic, buffer_info_scratch_base, tid_in_tg,
                                           gen_prim, so_buffer, buffer_offsets, emit_prim);
    u_foreach_bit(stream, info->streams_written) {
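
Below is a minimal standalone sketch of the scratch-size arithmetic introduced in ac_ngg_get_scratch_lds_size() above, for anyone who wants to sanity-check the numbers. The ALIGN() macro is a local stand-in for Mesa's utility macro, and the 256-thread workgroup (4 waves in Wave64, 8 waves in Wave32) is an illustrative assumption, not a value taken from the patch.

/* Sketch only: mirrors the patched GS branch of ac_ngg_get_scratch_lds_size().
 * ALIGN() here is a local power-of-two-alignment stand-in, not Mesa's macro. */
#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

static unsigned
gs_scratch_lds_size(unsigned max_num_waves, int streamout_enabled)
{
   /* Repacking output vertices: 1 dword per 4 waves (1 byte per wave, rounded up). */
   unsigned size = ALIGN(max_num_waves, 4u);

   if (streamout_enabled) {
      const unsigned num_streams = 4;
      const unsigned num_so_buffers = 4;
      /* Per-stream repack slots, plus 1 dword per stream and 1 dword per buffer
       * for the streamout buffer info. */
      size += num_streams * ALIGN(max_num_waves, 4u) + num_streams * 4 + num_so_buffers * 4;
   }
   return size;
}

int main(void)
{
   /* Assumed 256-thread NGG GS workgroup. */
   printf("Wave64: %u bytes\n", gs_scratch_lds_size(256 / 64, 1)); /* 4 + 16 + 16 + 16 = 52 */
   printf("Wave32: %u bytes\n", gs_scratch_lds_size(256 / 32, 1)); /* 8 + 32 + 16 + 16 = 72 */
   return 0;
}

Under these assumed wave counts, both totals exceed the flat 32 bytes that the removed MAX2(scratch_lds_size, 32) used to reserve, which is the undersized streamout scratch allocation this patch replaces.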