From 0ea39c6305abc2eac2b3e7859f4859c770cfbb4a Mon Sep 17 00:00:00 2001 From: Duncan Brawley Date: Wed, 18 Feb 2026 17:16:12 +0000 Subject: [PATCH] pco: Use vertex input registers in register allocation Add support for the use of vertex input registers as additional general purpose registers which previously was restricted to temporary registers. Use of vertex input registers as additional general purpose registers is not available for fragment shaders. Vertex input registers are similar to temporary registers. The only difference is that vertex input registers can contain pre-initialised data when the shader starts. By default, the number of vertex input registers used for register allocation is the number of vertex input registers used for their pre-initialised data rounded up to the nearest multiple of 4, as vertex input registers are allocated in blocks of 4. If PCO_DEBUG=alloc_extra_vtxins is used, a minimum of 12 vertex input registers are available for register allocation. Signed-off-by: Duncan Brawley Reviewed-by: Simon Perretta Part-of: --- docs/envvars.rst | 3 + src/imagination/.clang-format | 1 + .../include/hwdef/rogue_hw_utils.h | 6 + src/imagination/pco/pco.c | 1 + src/imagination/pco/pco_debug.c | 3 + src/imagination/pco/pco_internal.h | 17 +++ src/imagination/pco/pco_print.c | 4 +- src/imagination/pco/pco_ra.c | 111 ++++++++++++++++-- src/util/shader_stats.xml | 1 + 9 files changed, 135 insertions(+), 12 deletions(-) diff --git a/docs/envvars.rst b/docs/envvars.rst index d3ad9b5985c..dbaaa84b9dc 100644 --- a/docs/envvars.rst +++ b/docs/envvars.rst @@ -2139,6 +2139,9 @@ PowerVR driver environment variables ``no_pred_cf`` No predicated execution in CF. + ``alloc_extra_vtxins`` + Allocates additional vertex input registers. + .. envvar:: PCO_SKIP_PASSES A comma-separated list of passes to skip. 
diff --git a/src/imagination/.clang-format b/src/imagination/.clang-format index 30ed663d997..c918aaaef41 100644 --- a/src/imagination/.clang-format +++ b/src/imagination/.clang-format @@ -294,6 +294,7 @@ ForEachMacros: [ 'pco_foreach_instr_src_ssa_from', 'pco_foreach_instr_src_vreg', 'pco_foreach_instr_src_vreg_ssa', + 'pco_foreach_instr_src_vtxin_reg', 'pco_foreach_loop_in_func', 'pco_foreach_loop_in_func_from', 'pco_foreach_loop_in_func_from_rev', diff --git a/src/imagination/include/hwdef/rogue_hw_utils.h b/src/imagination/include/hwdef/rogue_hw_utils.h index 6324cd56c42..a859059e9b6 100644 --- a/src/imagination/include/hwdef/rogue_hw_utils.h +++ b/src/imagination/include/hwdef/rogue_hw_utils.h @@ -478,6 +478,12 @@ rogue_max_wg_temps(const struct pvr_device_info *dev_info, return temps; } +static inline uint32_t rogue_get_vtxins(void) +{ + /* TODO: use highest safe number of vertex input registers. */ + return 12; +} + static inline uint32_t rogue_num_uscs_per_tile(const struct pvr_device_info *dev_info) { diff --git a/src/imagination/pco/pco.c b/src/imagination/pco/pco.c index 42c76bd9db3..4274442b362 100644 --- a/src/imagination/pco/pco.c +++ b/src/imagination/pco/pco.c @@ -370,6 +370,7 @@ struct pvr_stats pco_get_pvr_stats(pco_shader *shader) .scratch_size = shader->data.common.scratch, .spill_count = shader->data.common.spilled_temps, .temp_count = shader->data.common.temps, + .vtxin_count = shader->data.common.vtxins, .loop_count = loop_count, .inst_group_count = igrp_count, .main_inst_group_count = main_count, diff --git a/src/imagination/pco/pco_debug.c b/src/imagination/pco/pco_debug.c index 54f5c254e35..c6d48c56cfe 100644 --- a/src/imagination/pco/pco_debug.c +++ b/src/imagination/pco/pco_debug.c @@ -26,6 +26,9 @@ static const struct debug_named_value pco_debug_options[] = { { "val_skip", PCO_DEBUG_VAL_SKIP, "Skip IR validation." }, { "reindex", PCO_DEBUG_REINDEX, "Reindex IR at the end of each pass." 
}, { "no_pred_cf", PCO_DEBUG_NO_PRED_CF, "No predicated execution in CF." }, + { "alloc_extra_vtxins", + PCO_DEBUG_ALLOC_EXTRA_VTXINS, + "Allocates additional vertex input registers." }, DEBUG_NAMED_VALUE_END, }; diff --git a/src/imagination/pco/pco_internal.h b/src/imagination/pco/pco_internal.h index fb900c3bb14..49987c4f573 100644 --- a/src/imagination/pco/pco_internal.h +++ b/src/imagination/pco/pco_internal.h @@ -57,6 +57,7 @@ enum pco_debug { PCO_DEBUG_VAL_SKIP = BITFIELD64_BIT(0), PCO_DEBUG_REINDEX = BITFIELD64_BIT(1), PCO_DEBUG_NO_PRED_CF = BITFIELD64_BIT(2), + PCO_DEBUG_ALLOC_EXTRA_VTXINS = BITFIELD64_BIT(3), }; extern uint64_t pco_debug; @@ -353,6 +354,7 @@ typedef struct _pco_func { unsigned next_loop; /** Next loop index. */ unsigned temps; /** Number of temps allocated. */ + unsigned vtxins; /** Number of vertex input registers used. */ pco_ref emc; /** Execution mask counter register. */ @@ -731,6 +733,10 @@ PCO_DEFINE_CAST(pco_cf_node_as_func, pco_foreach_instr_src (psrc, instr) \ if (pco_ref_is_hwreg(*psrc)) +#define pco_foreach_instr_src_vtxin_reg(psrc, instr) \ + pco_foreach_instr_src (psrc, instr) \ + if (pco_ref_is_vtxin(*psrc)) + #define pco_cf_node_head(list) list_first_entry(list, pco_cf_node, link) #define pco_cf_node_tail(list) list_last_entry(list, pco_cf_node, link) @@ -1985,6 +1991,17 @@ static inline bool pco_ref_is_scalar(pco_ref ref) return !ref.chans; } +/** + * \brief Return whether a reference is a vertex input register. + * + * \param[in] ref PCO reference. + * \return True if the reference is a vertex input register. + */ +static inline bool pco_ref_is_vtxin(pco_ref ref) +{ + return ref.type == PCO_REF_TYPE_REG && ref.reg_class == PCO_REG_CLASS_VTXIN; +} + /* PCO ref getters. */ /** * \brief Returns the pointee component of an indexed register reference. 
diff --git a/src/imagination/pco/pco_print.c b/src/imagination/pco/pco_print.c index 17ad1996f88..bd50c82bf0c 100644 --- a/src/imagination/pco/pco_print.c +++ b/src/imagination/pco/pco_print.c @@ -1187,8 +1187,10 @@ static void pco_print_func(pco_print_state *state, pco_func *func) { pco_printfi(state, "func"); pco_print_func_sig(state, func, false); - if (state->is_grouped) + if (state->is_grouped) { pco_printf(state, " /* temps: %u */", func->temps); + pco_printf(state, " /* vtxins: %u */", func->vtxins); + } pco_printf(state, "\n"); pco_printfi(state, "{\n"); diff --git a/src/imagination/pco/pco_ra.c b/src/imagination/pco/pco_ra.c index cca5e8f20ba..9173573dae3 100644 --- a/src/imagination/pco/pco_ra.c +++ b/src/imagination/pco/pco_ra.c @@ -461,9 +461,10 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) preproc_vecs(func); + unsigned num_rsvd_vtxins = func->parent_shader->data.common.vtxins; unsigned num_ssas = func->next_ssa; unsigned num_vregs = func->next_vreg; - unsigned num_vars = num_ssas + num_vregs; + unsigned num_vars = num_ssas + num_vregs + num_rsvd_vtxins; /* Collect used bit sizes. 
*/ uint8_t used_bits = 0; @@ -492,7 +493,9 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) assert(only_32bit); struct ra_regs *ra_regs = - ra_alloc_reg_set(func, ctx->allocable_temps, !only_32bit); + ra_alloc_reg_set(func, + ctx->allocable_temps + ctx->allocable_vtxins, + !only_32bit); BITSET_WORD *comps = rzalloc_array_size(ra_regs, sizeof(*comps), BITSET_WORDS(num_ssas)); @@ -609,6 +612,11 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) for (unsigned t = 0; t < ctx->allocable_temps - (stride - 1); ++t) ra_class_add_reg(ra_class, t); + + for (unsigned t = ctx->allocable_temps; + t < ctx->allocable_temps + ctx->allocable_vtxins - (stride - 1); + ++t) + ra_class_add_reg(ra_class, t); } ra_set_finalize(ra_regs, NULL); @@ -686,6 +694,26 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) live_ranges[src.val].end = MAX2(live_ranges[src.val].end, instr->index); } + + /* Ensure that vertex input registers with pre-initialised data are not + * clobbered too early. + */ + if (ctx->allocable_vtxins > 0) { + pco_foreach_instr_src_vtxin_reg (psrc, instr) { + pco_ref src = *psrc; + + /* Place vtxin regs after ssa vars and vregs. */ + src.val += num_ssas + num_vregs; + + live_ranges[src.val].end = + MAX2(live_ranges[src.val].end, instr->index); + live_ranges[src.val].start = 0; + + ra_set_node_reg(ra_graph, + src.val, + psrc->val + ctx->allocable_temps); + } + } } /* Extend lifetimes of non-overriden vecs that have comp instructions. 
*/ @@ -898,16 +926,31 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) } for (unsigned u = 0; u < chans; ++u) { - pco_ref dest = - pco_ref_hwreg(temp_dest_base + offset, PCO_REG_CLASS_TEMP); + pco_ref dest; + if (temp_dest_base + offset >= ctx->allocable_temps) { + dest = pco_ref_hwreg(temp_dest_base + offset - + ctx->allocable_temps, + PCO_REG_CLASS_VTXIN); + } else { + dest = pco_ref_hwreg(temp_dest_base + offset, + PCO_REG_CLASS_TEMP); + } + dest = pco_ref_offset(dest, u); dest = pco_ref_offset(dest, ctx->temp_alloc_offset); pco_ref src; - if (pco_ref_is_ssa(*psrc) || pco_ref_is_vreg(*psrc)) - src = pco_ref_hwreg(temp_src_base, PCO_REG_CLASS_TEMP); - else + if (pco_ref_is_ssa(*psrc) || pco_ref_is_vreg(*psrc)) { + if (temp_src_base >= ctx->allocable_temps) { + src = + pco_ref_hwreg(temp_src_base - ctx->allocable_temps, + PCO_REG_CLASS_VTXIN); + } else { + src = pco_ref_hwreg(temp_src_base, PCO_REG_CLASS_TEMP); + } + } else { src = pco_ref_chans(*psrc, 1); + } src = pco_ref_offset(src, u); src = pco_ref_offset(src, ctx->temp_alloc_offset); @@ -981,7 +1024,15 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) pdest->type = PCO_REF_TYPE_REG; pdest->reg_class = PCO_REG_CLASS_TEMP; pdest->val = val + ctx->temp_alloc_offset; - temps = MAX2(temps, dest_temps + ctx->temp_alloc_offset); + + /* Got a vertex input register. */ + if (val >= ctx->allocable_temps) { + pdest->reg_class = PCO_REG_CLASS_VTXIN; + pdest->val = val - ctx->allocable_temps; + vtxins = MAX2(vtxins, dest_temps - ctx->allocable_temps); + } else { + temps = MAX2(temps, dest_temps + ctx->temp_alloc_offset); + } } pco_foreach_instr_src_ssa (psrc, instr) { @@ -996,6 +1047,12 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) psrc->type = PCO_REF_TYPE_REG; psrc->reg_class = PCO_REG_CLASS_TEMP; psrc->val = val + ctx->temp_alloc_offset; + + /* Got a vertex input register. 
*/ + if (val >= ctx->allocable_temps) { + psrc->reg_class = PCO_REG_CLASS_VTXIN; + psrc->val = val - ctx->allocable_temps; + } } pco_foreach_instr_dest_vreg (pdest, instr) { @@ -1005,7 +1062,15 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) pdest->type = PCO_REF_TYPE_REG; pdest->reg_class = PCO_REG_CLASS_TEMP; pdest->val = val + ctx->temp_alloc_offset; - temps = MAX2(temps, dest_temps); + + /* Got a vertex input register. */ + if (val >= ctx->allocable_temps) { + pdest->reg_class = PCO_REG_CLASS_VTXIN; + pdest->val = val - ctx->allocable_temps; + vtxins = MAX2(vtxins, dest_temps - ctx->allocable_temps); + } else { + temps = MAX2(temps, dest_temps); + } } pco_foreach_instr_src_vreg (psrc, instr) { @@ -1014,6 +1079,12 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) psrc->type = PCO_REF_TYPE_REG; psrc->reg_class = PCO_REG_CLASS_TEMP; psrc->val = val + ctx->temp_alloc_offset; + + /* Got a vertex input register. */ + if (val >= ctx->allocable_temps) { + psrc->reg_class = PCO_REG_CLASS_VTXIN; + psrc->val = val - ctx->allocable_temps; + } } /* Drop no-ops. */ @@ -1027,6 +1098,7 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) ralloc_free(ra_regs); func->temps = temps; + func->vtxins = vtxins; if (pco_should_print_shader(func->parent_shader) && PCO_DEBUG_PRINT(RA)) { printf( @@ -1060,12 +1132,27 @@ bool pco_ra(pco_shader *shader) * unsigned opt_temps = rogue_get_optimal_temps(shader->ctx->dev_info); */ + /* If any vertex input registers are already used, round up to the nearest + * multiple of 4 as vertex input registers are allocated in blocks of 4. + * + * This number is used by default as the maximum safe number of vertex input + * registers that can be used is not currently known. 
+ */ + unsigned hw_vtxins = ALIGN_POT(shader->data.common.vtxins, + ROGUE_USRM_GRANULARITY_IN_REGISTERS); + + if (shader->stage != MESA_SHADER_FRAGMENT && !shader->is_internal) { + if (PCO_DEBUG(ALLOC_EXTRA_VTXINS)) { + hw_vtxins = MAX2(hw_vtxins, rogue_get_vtxins()); + } + } + /* TODO: different number of temps available if preamble/phase change. */ /* TODO: different number of temps available if barriers are in use. */ - /* TODO: support for internal and vtxin registers. */ + /* TODO: support for internal registers. */ pco_ra_ctx ctx = { .allocable_temps = hw_temps, - .allocable_vtxins = 0, + .allocable_vtxins = hw_vtxins, .allocable_interns = 0, }; @@ -1088,6 +1175,8 @@ bool pco_ra(pco_shader *shader) progress |= pco_ra_func(func, &ctx); shader->data.common.temps = MAX2(shader->data.common.temps, func->temps); + shader->data.common.vtxins = + MAX2(shader->data.common.vtxins, func->vtxins); } shader->data.common.spilled_temps = ctx.spilled_temps; diff --git a/src/util/shader_stats.xml b/src/util/shader_stats.xml index 137f2ffb335..340f72fbe42 100644 --- a/src/util/shader_stats.xml +++ b/src/util/shader_stats.xml @@ -200,6 +200,7 @@ Scratch size per instance in bytes Number of spilled registers per instance Number of allocated temp registers + Number of used vertex input registers Number of not unrolled loops in the shader Total number of instruction groups Number of main instruction groups