radeonsi: gather pipe_stream_output_info from NIR intrinsics

This stops gathering pipe_stream_output_info in the create_*s_state context
functions because NIR contains everything and can do more advanced shader
linking this way.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14414>
This commit is contained in:
Marek Olšák 2021-12-19 20:10:03 -05:00 committed by Marge Bot
parent 981bd8cbe2
commit b57a163b7d
8 changed files with 41 additions and 35 deletions

View file

@ -606,7 +606,7 @@ static unsigned ngg_nogs_vertex_size(struct si_shader *shader)
/* The edgeflag is always stored in the last element that's also
* used for padding to reduce LDS bank conflicts. */
if (shader->selector->so.num_outputs)
if (shader->selector->info.enabled_streamout_buffer_mask)
lds_vertex_size = 4 * shader->selector->info.num_outputs + 1;
if (gfx10_ngg_writes_user_edgeflags(shader))
lds_vertex_size = MAX2(lds_vertex_size, 1);
@ -2169,7 +2169,7 @@ unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader)
{
const struct si_shader_selector *sel = shader->selector;
if (sel->info.stage == MESA_SHADER_GEOMETRY && sel->so.num_outputs)
if (sel->info.stage == MESA_SHADER_GEOMETRY && sel->info.enabled_streamout_buffer_mask)
return 44;
return 8;

View file

@ -814,7 +814,7 @@ struct si_streamout {
/* External state which comes from the vertex shader,
* it must be set explicitly when binding a shader. */
uint16_t *stride_in_dw;
uint8_t *stride_in_dw;
unsigned enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */
/* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */

View file

@ -27,6 +27,7 @@
#include "nir.h"
#include "nir_builder.h"
#include "nir_serialize.h"
#include "nir/nir_helpers.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
@ -1587,7 +1588,9 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
bool free_nir;
struct nir_shader *nir = si_get_nir_shader(sel, &shader->key, &free_nir);
struct pipe_stream_output_info so = sel->so;
struct pipe_stream_output_info so = {};
if (sel->info.enabled_streamout_buffer_mask)
nir_gather_stream_output_info(nir, &so);
/* Dump NIR before doing NIR->LLVM conversion in case the
* conversion fails. */
@ -1616,7 +1619,7 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
/* The GS copy shader is compiled next. */
if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
shader->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug);
shader->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, &so, debug);
if (!shader->gs_copy_shader) {
fprintf(stderr, "radeonsi: can't create GS copy shader\n");
return false;
@ -2312,7 +2315,7 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler
shader->uses_vs_state_outprim = sscreen->use_ngg &&
/* Only used by streamout in vertex shaders. */
sel->info.stage == MESA_SHADER_VERTEX &&
sel->so.num_outputs;
sel->info.enabled_streamout_buffer_mask;
if (sel->info.stage == MESA_SHADER_VERTEX) {
shader->uses_base_instance = sel->info.uses_base_instance ||

View file

@ -367,6 +367,7 @@ struct si_shader_info {
int constbuf0_num_slots;
ubyte num_stream_output_components[4];
uint16_t enabled_streamout_buffer_mask;
uint num_memory_stores;
@ -459,7 +460,6 @@ struct si_shader_selector {
void *nir_binary;
unsigned nir_size;
struct pipe_stream_output_info so;
struct si_shader_info info;
enum pipe_shader_type pipe_shader_type;
@ -486,7 +486,6 @@ struct si_shader_selector {
uint16_t gsvs_vertex_size;
ubyte gs_input_verts_per_prim;
unsigned max_gsvs_emit_size;
uint16_t enabled_streamout_buffer_mask;
bool tess_turns_off_ngg;
/* PS parameters. */
@ -959,6 +958,7 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf
struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
struct ac_llvm_compiler *compiler,
struct si_shader_selector *gs_selector,
const struct pipe_stream_output_info *so,
struct util_debug_callback *debug);
/* si_shader_nir.c */

View file

@ -325,6 +325,7 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr
(nir_intrinsic_component(intr) * 2);
unsigned new_mask = mask & ~info->output_usagemask[loc];
/* Iterate over all components. */
for (unsigned i = 0; i < 4; i++) {
unsigned stream = (gs_streams >> (i * 2)) & 0x3;
@ -332,6 +333,16 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr
info->output_streams[loc] |= stream << (i * 2);
info->num_stream_output_components[stream]++;
}
if (nir_intrinsic_has_io_xfb(intr)) {
nir_io_xfb xfb = i < 2 ? nir_intrinsic_io_xfb(intr) :
nir_intrinsic_io_xfb2(intr);
if (xfb.out[i % 2].num_components) {
unsigned stream = (gs_streams >> (i * 2)) & 0x3;
info->enabled_streamout_buffer_mask |=
BITFIELD_BIT(stream * 4 + xfb.out[i % 2].buffer);
}
}
}
if (nir_intrinsic_has_src_type(intr))

View file

@ -422,6 +422,7 @@ void si_preload_gs_rings(struct si_shader_context *ctx)
struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
struct ac_llvm_compiler *compiler,
struct si_shader_selector *gs_selector,
const struct pipe_stream_output_info *so,
struct util_debug_callback *debug)
{
struct si_shader_context ctx;
@ -446,7 +447,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
si_llvm_context_init(&ctx, sscreen, compiler, shader->wave_size);
ctx.shader = shader;
ctx.stage = MESA_SHADER_VERTEX;
ctx.so = gs_selector->so;
ctx.so = *so;
builder = ctx.ac.builder;

View file

@ -203,9 +203,6 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es,
_mesa_sha1_init(&ctx);
_mesa_sha1_update(&ctx, &shader_variant_flags, 4);
_mesa_sha1_update(&ctx, ir_binary, ir_size);
if (sel->info.stage == MESA_SHADER_VERTEX || sel->info.stage == MESA_SHADER_TESS_EVAL ||
sel->info.stage == MESA_SHADER_GEOMETRY)
_mesa_sha1_update(&ctx, &sel->so, sizeof(sel->so));
_mesa_sha1_final(&ctx, ir_sha1_cache_key);
if (ir_binary == blob.data)
@ -1512,7 +1509,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
}
shader->ctx_reg.ngg.vgt_stages.u.ngg = 1;
shader->ctx_reg.ngg.vgt_stages.u.streamout = gs_sel->so.num_outputs;
shader->ctx_reg.ngg.vgt_stages.u.streamout = !!gs_sel->info.enabled_streamout_buffer_mask;
shader->ctx_reg.ngg.vgt_stages.u.ngg_passthrough = gfx10_is_ngg_passthrough(shader);
shader->ctx_reg.ngg.vgt_stages.u.gs_wave32 = shader->wave_size == 32;
}
@ -1702,11 +1699,11 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
rsrc1 |= S_00B128_SGPRS((shader->config.num_sgprs - 1) / 8);
if (!sscreen->use_ngg_streamout) {
rsrc2 |= S_00B12C_SO_BASE0_EN(!!shader->selector->so.stride[0]) |
S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) |
S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
S_00B12C_SO_EN(!!shader->selector->so.num_outputs);
rsrc2 |= S_00B12C_SO_BASE0_EN(!!shader->selector->info.base.xfb_stride[0]) |
S_00B12C_SO_BASE1_EN(!!shader->selector->info.base.xfb_stride[1]) |
S_00B12C_SO_BASE2_EN(!!shader->selector->info.base.xfb_stride[2]) |
S_00B12C_SO_BASE3_EN(!!shader->selector->info.base.xfb_stride[3]) |
S_00B12C_SO_EN(!!info->enabled_streamout_buffer_mask);
}
si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS, rsrc1);
@ -2783,7 +2780,7 @@ int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state
}
}
static void si_parse_next_shader_property(const struct si_shader_info *info, bool streamout,
static void si_parse_next_shader_property(const struct si_shader_info *info,
union si_shader_key *key)
{
gl_shader_stage next_shader = info->base.next_stage;
@ -2804,7 +2801,7 @@ static void si_parse_next_shader_property(const struct si_shader_info *info, boo
* assume that it's a HW LS. (the next shader is TCS)
* This heuristic is needed for separate shader objects.
*/
if (!info->writes_position && !streamout)
if (!info->writes_position && !info->enabled_streamout_buffer_mask)
key->ge.as_ls = 1;
}
break;
@ -2874,10 +2871,11 @@ static void si_init_shader_selector_async(void *job, void *gdata, int thread_ind
shader->selector = sel;
shader->is_monolithic = false;
si_parse_next_shader_property(&sel->info, sel->so.num_outputs != 0, &shader->key);
si_parse_next_shader_property(&sel->info, &shader->key);
if (sel->info.stage <= MESA_SHADER_GEOMETRY &&
sscreen->use_ngg && (!sel->so.num_outputs || sscreen->use_ngg_streamout) &&
sscreen->use_ngg && (!sel->info.enabled_streamout_buffer_mask ||
sscreen->use_ngg_streamout) &&
((sel->info.stage == MESA_SHADER_VERTEX && !shader->key.ge.as_ls) ||
sel->info.stage == MESA_SHADER_TESS_EVAL || sel->info.stage == MESA_SHADER_GEOMETRY))
shader->key.ge.as_ngg = 1;
@ -3035,8 +3033,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
sel->compiler_ctx_state.debug = sctx->debug;
sel->compiler_ctx_state.is_debug_context = sctx->is_debug;
sel->so = state->stream_output;
if (state->type == PIPE_SHADER_IR_TGSI) {
sel->nir = tgsi_to_nir(state->tokens, ctx->screen, true);
} else {
@ -3057,12 +3053,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
si_get_active_slot_masks(&sel->info, &sel->active_const_and_shader_buffers,
&sel->active_samplers_and_images);
/* Record which streamout buffers are enabled. */
for (unsigned i = 0; i < sel->so.num_outputs; i++) {
sel->enabled_streamout_buffer_mask |= (1 << sel->so.output[i].output_buffer)
<< (sel->so.output[i].stream * 4);
}
sel->num_vs_inputs =
sel->info.stage == MESA_SHADER_VERTEX && !sel->info.base.vs.blit_sgprs_amd
? sel->info.num_inputs
@ -3197,7 +3187,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
!sel->info.writes_viewport_index && /* cull only against viewport 0 */
!sel->info.base.writes_memory &&
/* NGG GS supports culling with streamout because it culls after streamout. */
(sel->info.stage == MESA_SHADER_GEOMETRY || !sel->so.num_outputs) &&
(sel->info.stage == MESA_SHADER_GEOMETRY || !sel->info.enabled_streamout_buffer_mask) &&
(sel->info.stage != MESA_SHADER_GEOMETRY || sel->info.num_stream_output_components[0]) &&
(sel->info.stage != MESA_SHADER_VERTEX ||
(!sel->info.base.vs.blit_sgprs_amd &&
@ -3312,8 +3302,8 @@ static void si_update_streamout_state(struct si_context *sctx)
if (!shader_with_so)
return;
sctx->streamout.enabled_stream_buffers_mask = shader_with_so->enabled_streamout_buffer_mask;
sctx->streamout.stride_in_dw = shader_with_so->so.stride;
sctx->streamout.enabled_stream_buffers_mask = shader_with_so->info.enabled_streamout_buffer_mask;
sctx->streamout.stride_in_dw = shader_with_so->info.base.xfb_stride;
}
static void si_update_clip_regs(struct si_context *sctx, struct si_shader_selector *old_hw_vs,
@ -3440,7 +3430,8 @@ bool si_update_ngg(struct si_context *sctx)
} else if (!sctx->screen->use_ngg_streamout) {
struct si_shader_selector *last = si_get_vs(sctx)->cso;
if ((last && last->so.num_outputs) || sctx->streamout.prims_gen_query_enabled)
if ((last && last->info.enabled_streamout_buffer_mask) ||
sctx->streamout.prims_gen_query_enabled)
new_ngg = false;
}

View file

@ -308,7 +308,7 @@ static void si_emit_streamout_begin(struct si_context *sctx)
{
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
struct si_streamout_target **t = sctx->streamout.targets;
uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
uint8_t *stride_in_dw = sctx->streamout.stride_in_dw;
unsigned i;
si_flush_vgt_streamout(sctx);