diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 87935110c34..2a90a07f214 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1580,6 +1580,61 @@ void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir)
    shader->info.uses_vmem_sampler_or_bvh |= info.uses_vmem_sampler_or_bvh;
 }
 
+/**
+ * Assign parameter export indices to the varying outputs stored by "nir".
+ *
+ * Every store_output whose slot is a varying, isn't flagged no_varying, has
+ * zero in the low two gs_streams bits and still holds
+ * AC_EXP_PARAM_DEFAULT_VAL_0000 in vs_output_param_offset gets the next index
+ * from *num_param_exports, and its "base" is recorded in *output_param_mask.
+ * Afterwards, every slot that slot_remap redirects inherits the target's offset.
+ */
+static void si_nir_assign_param_offsets(nir_shader *nir, const struct si_shader_info *info,
+                                        int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS],
+                                        uint8_t *num_param_exports, uint64_t *output_param_mask,
+                                        uint8_t vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS])
+{
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+   assert(impl);
+
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+         if (intr->intrinsic != nir_intrinsic_store_output)
+            continue;
+
+         /* No indirect indexing allowed. */
+         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
+         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);
+
+         assert(intr->num_components == 1); /* only scalar stores expected */
+         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+
+         /* Assign the param index if it's unassigned. */
+         if (nir_slot_is_varying(sem.location) && !sem.no_varying &&
+             (sem.gs_streams & 0x3) == 0 &&
+             vs_output_param_offset[sem.location] == AC_EXP_PARAM_DEFAULT_VAL_0000) {
+            /* The semantic and the base should be the same as in si_shader_info. */
+            assert(sem.location == info->output_semantic[nir_intrinsic_base(intr)]);
+            /* It must not be remapped (duplicated). */
+            assert(slot_remap[sem.location] == -1);
+
+            vs_output_param_offset[sem.location] = (*num_param_exports)++;
+            *output_param_mask |= BITFIELD64_BIT(nir_intrinsic_base(intr));
+         }
+      }
+   }
+
+   /* Duplicated outputs are redirected here. */
+   for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
+      if (slot_remap[i] >= 0)
+         vs_output_param_offset[i] = vs_output_param_offset[slot_remap[i]];
+   }
+}
+
 bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                        struct si_shader *shader, struct util_debug_callback *debug)
 {
@@ -1587,6 +1642,42 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
    bool free_nir;
    struct nir_shader *nir = si_get_nir_shader(sel, &shader->key, &free_nir);
 
+   /* Assign param export indices. */
+   if ((sel->stage == MESA_SHADER_VERTEX ||
+        sel->stage == MESA_SHADER_TESS_EVAL ||
+        (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)) &&
+       !shader->key.ge.as_ls && !shader->key.ge.as_es) {
+      /* Initialize this first. */
+      shader->info.nr_param_exports = 0;
+      shader->info.vs_output_param_mask = 0;
+
+      STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
+      memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
+             sizeof(shader->info.vs_output_param_offset));
+
+      /* A slot remapping table for duplicated outputs, so that 1 vertex shader output can be
+       * mapped to multiple fragment shader inputs.
+       */
+      int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS];
+      memset(slot_remap, -1, sizeof(slot_remap));
+
+      /* This sets DEFAULT_VAL for constant outputs in vs_output_param_offset. */
+      /* TODO: This doesn't affect GS. */
+      NIR_PASS_V(nir, ac_nir_optimize_outputs, false, slot_remap,
+                 shader->info.vs_output_param_offset);
+
+      /* Assign the non-constant outputs. */
+      /* TODO: Use this for the GS copy shader too. */
+      si_nir_assign_param_offsets(nir, &sel->info, slot_remap, &shader->info.nr_param_exports,
+                                  &shader->info.vs_output_param_mask,
+                                  shader->info.vs_output_param_offset);
+
+      if (shader->key.ge.mono.u.vs_export_prim_id) {
+         shader->info.vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = shader->info.nr_param_exports++;
+         shader->info.vs_output_param_mask |= BITFIELD64_BIT(sel->info.num_outputs);
+      }
+   }
+
    struct pipe_stream_output_info so = {};
    if (sel->info.enabled_streamout_buffer_mask)
       nir_gather_stream_output_info(nir, &so);
@@ -1635,13 +1726,14 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
       if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg)
          vs_output_param_offset = shader->gs_copy_shader->info.vs_output_param_offset;
 
+      /* We must use the original shader info before the removal of duplicated shader outputs. */
       /* VS and TES should also set primitive ID output if it's used. */
       unsigned num_outputs_with_prim_id = sel->info.num_outputs +
                                           shader->key.ge.mono.u.vs_export_prim_id;
 
       for (unsigned i = 0; i < num_outputs_with_prim_id; i++) {
          unsigned semantic = sel->info.output_semantic[i];
-         unsigned offset = vs_output_param_offset[i];
+         unsigned offset = vs_output_param_offset[semantic];
          unsigned ps_input_cntl;
 
          if (offset <= AC_EXP_PARAM_OFFSET_31) {
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 5f9e59391b2..98408fb508a 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -739,7 +739,8 @@ union si_shader_key {
 
 /* GCN-specific shader info. */
 struct si_shader_binary_info {
-   ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS];
+   ubyte vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS];
+   uint64_t vs_output_param_mask; /* which params to export, indexed by "base" */
    uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS];
    ubyte num_input_sgprs;
    ubyte num_input_vgprs;
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c
index 5a4c76793d8..1c26e82842d 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c
@@ -1061,31 +1061,6 @@ static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
    return sel->stage == MESA_SHADER_COMPUTE && sel->info.num_memory_stores > 1000;
 }
 
-static void si_optimize_vs_outputs(struct si_shader_context *ctx)
-{
-   struct si_shader *shader = ctx->shader;
-   struct si_shader_info *info = &shader->selector->info;
-   unsigned skip_vs_optim_mask = 0;
-
-   if ((ctx->stage != MESA_SHADER_VERTEX && ctx->stage != MESA_SHADER_TESS_EVAL) ||
-       shader->key.ge.as_ls || shader->key.ge.as_es)
-      return;
-
-   /* Optimizing these outputs is not possible, since they might be overriden
-    * at runtime with S_028644_PT_SPRITE_TEX. */
-   for (int i = 0; i < info->num_outputs; i++) {
-      if (info->output_semantic[i] == VARYING_SLOT_PNTC ||
-          (info->output_semantic[i] >= VARYING_SLOT_TEX0 &&
-           info->output_semantic[i] <= VARYING_SLOT_TEX7)) {
-         skip_vs_optim_mask |= 1u << shader->info.vs_output_param_offset[i];
-      }
-   }
-
-   ac_optimize_vs_outputs(&ctx->ac, ctx->main_fn, shader->info.vs_output_param_offset,
-                          info->num_outputs, skip_vs_optim_mask,
-                          &shader->info.nr_param_exports);
-}
-
 bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                             struct si_shader *shader, const struct pipe_stream_output_info *so,
                             struct util_debug_callback *debug, struct nir_shader *nir,
@@ -1295,9 +1270,6 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
 
    si_llvm_optimize_module(&ctx);
 
-   /* Post-optimization transformations and analysis. */
-   si_optimize_vs_outputs(&ctx);
-
    /* Make sure the input is a pointer and not integer followed by inttoptr. */
    assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);
 
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
index e20af7e1358..0bde0d99259 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
@@ -22,6 +22,7 @@
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
+#include "ac_nir.h"
 #include "si_pipe.h"
 #include "si_shader_internal.h"
 #include "sid.h"
@@ -444,6 +445,25 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
    shader->is_gs_copy_shader = true;
    shader->wave_size = si_determine_wave_size(sscreen, shader);
 
+   STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
+   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
+          sizeof(shader->info.vs_output_param_offset));
+
+   for (unsigned i = 0; i < gsinfo->num_outputs; i++) {
+      unsigned semantic = gsinfo->output_semantic[i];
+
+      /* Skip if no channel writes to stream 0. */
+      if (!nir_slot_is_varying(semantic) ||
+          (gsinfo->output_streams[i] & 0x03 &&
+           gsinfo->output_streams[i] & 0x0c &&
+           gsinfo->output_streams[i] & 0x30 &&
+           gsinfo->output_streams[i] & 0xc0))
+         continue;
+
+      shader->info.vs_output_param_offset[semantic] = shader->info.nr_param_exports++;
+      shader->info.vs_output_param_mask |= BITFIELD64_BIT(i);
+   }
+
    si_llvm_context_init(&ctx, sscreen, compiler, shader->wave_size);
    ctx.shader = shader;
    ctx.stage = MESA_SHADER_VERTEX;
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
index ab984f2f7fb..b54fc86ed2e 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
@@ -438,61 +438,6 @@ static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, const LLV
    memcpy(&args->out[0], values, sizeof(values[0]) * 4);
 }
 
-static void si_prepare_param_exports(struct si_shader_context *ctx,
-                                     const struct si_shader_output_values *outputs, unsigned noutput,
-                                     struct ac_export_args exports[32])
-{
-   struct si_shader *shader = ctx->shader;
-   unsigned param_count = 0;
-
-   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
-          sizeof(shader->info.vs_output_param_offset));
-
-   for (unsigned i = 0; i < noutput; i++) {
-      unsigned semantic = outputs[i].semantic;
-
-      /* Skip if no channel writes to stream 0. */
-      if (outputs[i].vertex_streams & 0x03 &&
-          outputs[i].vertex_streams & 0x0c &&
-          outputs[i].vertex_streams & 0x30 &&
-          outputs[i].vertex_streams & 0xc0)
-         continue;
-
-      switch (semantic) {
-      case VARYING_SLOT_LAYER:
-      case VARYING_SLOT_VIEWPORT:
-      case VARYING_SLOT_CLIP_DIST0:
-      case VARYING_SLOT_CLIP_DIST1:
-      case VARYING_SLOT_COL0:
-      case VARYING_SLOT_COL1:
-      case VARYING_SLOT_BFC0:
-      case VARYING_SLOT_BFC1:
-      case VARYING_SLOT_PRIMITIVE_ID:
-      case VARYING_SLOT_FOGC:
-         break;
-      default:
-         if ((semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7) ||
-             semantic >= VARYING_SLOT_VAR0)
-            break;
-         else
-            continue;
-      }
-
-      if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) &&
-          shader->key.ge.opt.kill_outputs &
-             (1ull << si_shader_io_get_unique_index(semantic, true)))
-         continue;
-
-      si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_PARAM + param_count,
-                                  &exports[param_count]);
-
-      assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
-      shader->info.vs_output_param_offset[i] = param_count++;
-   }
-
-   shader->info.nr_param_exports = param_count;
-}
-
 /**
  * Vertex color clamping.
  *
@@ -576,9 +521,6 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx,
 
    si_vertex_color_clamping(ctx, outputs, noutput);
 
-   struct ac_export_args param_exports[32];
-   si_prepare_param_exports(ctx, outputs, noutput, param_exports);
-
    /* Build position exports. */
    for (i = 0; i < noutput; i++) {
       switch (outputs[i].semantic) {
@@ -747,7 +689,23 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx,
       ac_build_export(&ctx->ac, &pos_args[i]);
    }
 
-   /* Build parameter exports. */
+   /* Build parameter exports. Use 2 loops to export params in ascending order.
+    * 32 is the maximum number of parameter exports.
+    */
+   struct ac_export_args param_exports[32] = {};
+   uint64_t vs_output_param_mask = shader->info.vs_output_param_mask;
+
+   while (vs_output_param_mask) {
+      unsigned i = u_bit_scan64(&vs_output_param_mask);
+      unsigned offset = shader->info.vs_output_param_offset[outputs[i].semantic];
+
+      assert(offset <= AC_EXP_PARAM_OFFSET_31);
+      assert(!param_exports[offset].enabled_channels);
+
+      si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_PARAM + offset,
+                                  &param_exports[offset]);
+   }
+
    for (unsigned i = 0; i < shader->info.nr_param_exports; i++)
       ac_build_export(&ctx->ac, &param_exports[i]);
 }