diff --git a/docs/envvars.rst b/docs/envvars.rst index d3ad9b5985c..dbaaa84b9dc 100644 --- a/docs/envvars.rst +++ b/docs/envvars.rst @@ -2139,6 +2139,9 @@ PowerVR driver environment variables ``no_pred_cf`` No predicated execution in CF. + ``alloc_extra_vtxins`` + Allocates additional vertex input registers. + .. envvar:: PCO_SKIP_PASSES A comma-separated list of passes to skip. diff --git a/src/imagination/.clang-format b/src/imagination/.clang-format index 30ed663d997..c918aaaef41 100644 --- a/src/imagination/.clang-format +++ b/src/imagination/.clang-format @@ -294,6 +294,7 @@ ForEachMacros: [ 'pco_foreach_instr_src_ssa_from', 'pco_foreach_instr_src_vreg', 'pco_foreach_instr_src_vreg_ssa', + 'pco_foreach_instr_src_vtxin_reg', 'pco_foreach_loop_in_func', 'pco_foreach_loop_in_func_from', 'pco_foreach_loop_in_func_from_rev', diff --git a/src/imagination/include/hwdef/rogue_hw_utils.h b/src/imagination/include/hwdef/rogue_hw_utils.h index 6324cd56c42..a859059e9b6 100644 --- a/src/imagination/include/hwdef/rogue_hw_utils.h +++ b/src/imagination/include/hwdef/rogue_hw_utils.h @@ -478,6 +478,12 @@ rogue_max_wg_temps(const struct pvr_device_info *dev_info, return temps; } +static inline uint32_t rogue_get_vtxins(void) +{ + /* The count returned below is a fixed placeholder. + * TODO: use highest safe number of vertex input registers. 
*/ + return 12; + } + static inline uint32_t rogue_num_uscs_per_tile(const struct pvr_device_info *dev_info) { diff --git a/src/imagination/pco/pco.c b/src/imagination/pco/pco.c index 42c76bd9db3..4274442b362 100644 --- a/src/imagination/pco/pco.c +++ b/src/imagination/pco/pco.c @@ -370,6 +370,7 @@ struct pvr_stats pco_get_pvr_stats(pco_shader *shader) .scratch_size = shader->data.common.scratch, .spill_count = shader->data.common.spilled_temps, .temp_count = shader->data.common.temps, + .vtxin_count = shader->data.common.vtxins, .loop_count = loop_count, .inst_group_count = igrp_count, .main_inst_group_count = main_count, diff --git a/src/imagination/pco/pco_debug.c b/src/imagination/pco/pco_debug.c index 54f5c254e35..c6d48c56cfe 100644 --- a/src/imagination/pco/pco_debug.c +++ b/src/imagination/pco/pco_debug.c @@ -26,6 +26,9 @@ static const struct debug_named_value pco_debug_options[] = { { "val_skip", PCO_DEBUG_VAL_SKIP, "Skip IR validation." }, { "reindex", PCO_DEBUG_REINDEX, "Reindex IR at the end of each pass." }, { "no_pred_cf", PCO_DEBUG_NO_PRED_CF, "No predicated execution in CF." }, + { "alloc_extra_vtxins", + PCO_DEBUG_ALLOC_EXTRA_VTXINS, + "Allocates additional vertex input registers." }, DEBUG_NAMED_VALUE_END, }; diff --git a/src/imagination/pco/pco_internal.h b/src/imagination/pco/pco_internal.h index fb900c3bb14..49987c4f573 100644 --- a/src/imagination/pco/pco_internal.h +++ b/src/imagination/pco/pco_internal.h @@ -57,6 +57,7 @@ enum pco_debug { PCO_DEBUG_VAL_SKIP = BITFIELD64_BIT(0), PCO_DEBUG_REINDEX = BITFIELD64_BIT(1), PCO_DEBUG_NO_PRED_CF = BITFIELD64_BIT(2), + PCO_DEBUG_ALLOC_EXTRA_VTXINS = BITFIELD64_BIT(3), }; extern uint64_t pco_debug; @@ -353,6 +354,7 @@ typedef struct _pco_func { unsigned next_loop; /** Next loop index. */ unsigned temps; /** Number of temps allocated. */ + unsigned vtxins; /** Number of vertex input registers used; written by pco_ra_func(). */ pco_ref emc; /** Execution mask counter register. 
*/ @@ -731,6 +733,10 @@ PCO_DEFINE_CAST(pco_cf_node_as_func, pco_foreach_instr_src (psrc, instr) \ if (pco_ref_is_hwreg(*psrc)) +#define pco_foreach_instr_src_vtxin_reg(psrc, instr) \ + pco_foreach_instr_src (psrc, instr) \ + if (pco_ref_is_vtxin(*psrc)) + #define pco_cf_node_head(list) list_first_entry(list, pco_cf_node, link) #define pco_cf_node_tail(list) list_last_entry(list, pco_cf_node, link) @@ -1985,6 +1991,17 @@ static inline bool pco_ref_is_scalar(pco_ref ref) return !ref.chans; } +/** + * \brief Return whether a reference is a vertex input register. + * + * A reference qualifies when its type is PCO_REF_TYPE_REG and its register + * class is PCO_REG_CLASS_VTXIN; this is the filter applied by + * pco_foreach_instr_src_vtxin_reg(). + * + * \param[in] ref PCO reference. + * \return True if the reference is a vertex input register. + */ +static inline bool pco_ref_is_vtxin(pco_ref ref) +{ + return ref.type == PCO_REF_TYPE_REG && ref.reg_class == PCO_REG_CLASS_VTXIN; +} + /* PCO ref getters. */ /** * \brief Returns the pointee component of an indexed register reference. diff --git a/src/imagination/pco/pco_print.c b/src/imagination/pco/pco_print.c index 17ad1996f88..bd50c82bf0c 100644 --- a/src/imagination/pco/pco_print.c +++ b/src/imagination/pco/pco_print.c @@ -1187,8 +1187,10 @@ static void pco_print_func(pco_print_state *state, pco_func *func) { pco_printfi(state, "func"); pco_print_func_sig(state, func, false); - if (state->is_grouped) + if (state->is_grouped) { pco_printf(state, " /* temps: %u */", func->temps); + pco_printf(state, " /* vtxins: %u */", func->vtxins); + } pco_printf(state, "\n"); pco_printfi(state, "{\n"); diff --git a/src/imagination/pco/pco_ra.c b/src/imagination/pco/pco_ra.c index cca5e8f20ba..9173573dae3 100644 --- a/src/imagination/pco/pco_ra.c +++ b/src/imagination/pco/pco_ra.c @@ -461,9 +461,10 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) preproc_vecs(func); + unsigned num_rsvd_vtxins = func->parent_shader->data.common.vtxins; unsigned num_ssas = func->next_ssa; unsigned num_vregs = func->next_vreg; - unsigned num_vars = num_ssas + num_vregs; + unsigned num_vars = num_ssas + num_vregs + 
num_rsvd_vtxins; /* Collect used bit sizes. */ uint8_t used_bits = 0; @@ -492,7 +493,9 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) assert(only_32bit); struct ra_regs *ra_regs = - ra_alloc_reg_set(func, ctx->allocable_temps, !only_32bit); + ra_alloc_reg_set(func, + ctx->allocable_temps + ctx->allocable_vtxins, + !only_32bit); BITSET_WORD *comps = rzalloc_array_size(ra_regs, sizeof(*comps), BITSET_WORDS(num_ssas)); @@ -609,6 +612,11 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) for (unsigned t = 0; t < ctx->allocable_temps - (stride - 1); ++t) ra_class_add_reg(ra_class, t); + + for (unsigned t = ctx->allocable_temps; + t < ctx->allocable_temps + ctx->allocable_vtxins - (stride - 1); + ++t) + ra_class_add_reg(ra_class, t); } ra_set_finalize(ra_regs, NULL); @@ -686,6 +694,26 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) live_ranges[src.val].end = MAX2(live_ranges[src.val].end, instr->index); } + + /* Ensure that vertex input registers with pre-initialised data are not + * clobbered too early. + * + * Each vtxin source gets its own RA node whose live range starts at + * instruction 0 and which is assigned, via ra_set_node_reg(), to + * register psrc->val + allocable_temps. + */ + if (ctx->allocable_vtxins > 0) { + pco_foreach_instr_src_vtxin_reg (psrc, instr) { + pco_ref src = *psrc; + + /* Place vtxin regs after ssa vars and vregs. */ + src.val += num_ssas + num_vregs; + + live_ranges[src.val].end = + MAX2(live_ranges[src.val].end, instr->index); + live_ranges[src.val].start = 0; + + ra_set_node_reg(ra_graph, + src.val, + psrc->val + ctx->allocable_temps); + } + } } /* Extend lifetimes of non-overriden vecs that have comp instructions. 
*/ @@ -898,16 +926,31 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) } for (unsigned u = 0; u < chans; ++u) { - pco_ref dest = - pco_ref_hwreg(temp_dest_base + offset, PCO_REG_CLASS_TEMP); + pco_ref dest; + if (temp_dest_base + offset >= ctx->allocable_temps) { + dest = pco_ref_hwreg(temp_dest_base + offset - + ctx->allocable_temps, + PCO_REG_CLASS_VTXIN); + } else { + dest = pco_ref_hwreg(temp_dest_base + offset, + PCO_REG_CLASS_TEMP); + } + /* NOTE(review): temp_alloc_offset is applied below even when dest/src + * is a vtxin register, whereas the dest/src rewrites further down + * do not add it for vtxins - confirm this is intended. + */ dest = pco_ref_offset(dest, u); dest = pco_ref_offset(dest, ctx->temp_alloc_offset); pco_ref src; - if (pco_ref_is_ssa(*psrc) || pco_ref_is_vreg(*psrc)) - src = pco_ref_hwreg(temp_src_base, PCO_REG_CLASS_TEMP); - else + if (pco_ref_is_ssa(*psrc) || pco_ref_is_vreg(*psrc)) { + if (temp_src_base >= ctx->allocable_temps) { + src = + pco_ref_hwreg(temp_src_base - ctx->allocable_temps, + PCO_REG_CLASS_VTXIN); + } else { + src = pco_ref_hwreg(temp_src_base, PCO_REG_CLASS_TEMP); + } + } else { src = pco_ref_chans(*psrc, 1); + } src = pco_ref_offset(src, u); src = pco_ref_offset(src, ctx->temp_alloc_offset); @@ -981,7 +1024,15 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) pdest->type = PCO_REF_TYPE_REG; pdest->reg_class = PCO_REG_CLASS_TEMP; pdest->val = val + ctx->temp_alloc_offset; - temps = MAX2(temps, dest_temps + ctx->temp_alloc_offset); + + /* Got a vertex input register: RA registers at or above + * allocable_temps are vtxins, so rebase the index into the vtxin + * class and grow the vtxin count rather than the temp count. + */ + if (val >= ctx->allocable_temps) { + pdest->reg_class = PCO_REG_CLASS_VTXIN; + pdest->val = val - ctx->allocable_temps; + vtxins = MAX2(vtxins, dest_temps - ctx->allocable_temps); + } else { + temps = MAX2(temps, dest_temps + ctx->temp_alloc_offset); + } } pco_foreach_instr_src_ssa (psrc, instr) { @@ -996,6 +1047,12 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) psrc->type = PCO_REF_TYPE_REG; psrc->reg_class = PCO_REG_CLASS_TEMP; psrc->val = val + ctx->temp_alloc_offset; + + /* Got a vertex input register. 
*/ + if (val >= ctx->allocable_temps) { + psrc->reg_class = PCO_REG_CLASS_VTXIN; + psrc->val = val - ctx->allocable_temps; + } } pco_foreach_instr_dest_vreg (pdest, instr) { @@ -1005,7 +1062,15 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) pdest->type = PCO_REF_TYPE_REG; pdest->reg_class = PCO_REG_CLASS_TEMP; pdest->val = val + ctx->temp_alloc_offset; - temps = MAX2(temps, dest_temps); + + /* Got a vertex input register: rebase it into the vtxin class and + * grow the vtxin count rather than the temp count. + */ + if (val >= ctx->allocable_temps) { + pdest->reg_class = PCO_REG_CLASS_VTXIN; + pdest->val = val - ctx->allocable_temps; + vtxins = MAX2(vtxins, dest_temps - ctx->allocable_temps); + } else { + temps = MAX2(temps, dest_temps); + } } pco_foreach_instr_src_vreg (psrc, instr) { @@ -1014,6 +1079,12 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) psrc->type = PCO_REF_TYPE_REG; psrc->reg_class = PCO_REG_CLASS_TEMP; psrc->val = val + ctx->temp_alloc_offset; + + /* Got a vertex input register: rebase it into the vtxin class. */ + if (val >= ctx->allocable_temps) { + psrc->reg_class = PCO_REG_CLASS_VTXIN; + psrc->val = val - ctx->allocable_temps; + } } /* Drop no-ops. */ @@ -1027,6 +1098,7 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) ralloc_free(ra_regs); func->temps = temps; + func->vtxins = vtxins; if (pco_should_print_shader(func->parent_shader) && PCO_DEBUG_PRINT(RA)) { printf( @@ -1060,12 +1132,27 @@ bool pco_ra(pco_shader *shader) * unsigned opt_temps = rogue_get_optimal_temps(shader->ctx->dev_info); */ + /* If any vertex input registers are already used, round up to the nearest + * multiple of 4 as vertex input registers are allocated in blocks of 4. + * + * This number is used by default as the maximum safe number of vertex input + * registers that can be used is not currently known. 
+ */ + unsigned hw_vtxins = ALIGN_POT(shader->data.common.vtxins, + ROGUE_USRM_GRANULARITY_IN_REGISTERS); + /* Extra vertex input registers are opt-in via the alloc_extra_vtxins + * debug option, and are never requested for fragment or internal + * shaders. + */ + if (shader->stage != MESA_SHADER_FRAGMENT && !shader->is_internal) { + if (PCO_DEBUG(ALLOC_EXTRA_VTXINS)) { + hw_vtxins = MAX2(hw_vtxins, rogue_get_vtxins()); + } + } + /* TODO: different number of temps available if preamble/phase change. */ /* TODO: different number of temps available if barriers are in use. */ - /* TODO: support for internal and vtxin registers. */ + /* TODO: support for internal registers. */ pco_ra_ctx ctx = { .allocable_temps = hw_temps, - .allocable_vtxins = 0, + .allocable_vtxins = hw_vtxins, .allocable_interns = 0, }; @@ -1088,6 +1175,8 @@ bool pco_ra(pco_shader *shader) progress |= pco_ra_func(func, &ctx); shader->data.common.temps = MAX2(shader->data.common.temps, func->temps); + shader->data.common.vtxins = + MAX2(shader->data.common.vtxins, func->vtxins); } shader->data.common.spilled_temps = ctx.spilled_temps; diff --git a/src/util/shader_stats.xml b/src/util/shader_stats.xml index 137f2ffb335..340f72fbe42 100644 --- a/src/util/shader_stats.xml +++ b/src/util/shader_stats.xml @@ -200,6 +200,7 @@ Scratch size per instance in bytes Number of spilled registers per instance Number of allocated temp registers + Number of used vertex input registers Number of not unrolled loops in the shader Total number of instruction groups Number of main instruction groups