mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-03-11 17:50:32 +01:00
pco: Use vertex input registers in register allocation
Add support for using vertex input registers as additional general purpose registers during register allocation, which was previously restricted to temporary registers. Use of vertex input registers as additional general purpose registers is not available for fragment shaders. Vertex input registers are similar to temporary registers; the only difference is that vertex input registers can contain pre-initialised data when the shader starts. By default, the number of vertex input registers used for register allocation is the number of vertex input registers used for their pre-initialised data, rounded up to the nearest multiple of 4, as vertex input registers are allocated in blocks of 4. If PCO_DEBUG=alloc_extra_vtxins is used, a minimum of 12 vertex input registers are available for register allocation. Signed-off-by: Duncan Brawley <duncan.brawley@imgtec.com> Reviewed-by: Simon Perretta <simon.perretta@imgtec.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39886>
This commit is contained in:
parent
8592c177d1
commit
0ea39c6305
9 changed files with 135 additions and 12 deletions
|
|
@ -2139,6 +2139,9 @@ PowerVR driver environment variables
|
|||
``no_pred_cf``
|
||||
No predicated execution in CF.
|
||||
|
||||
``alloc_extra_vtxins``
|
||||
Allocates additional vertex input registers.
|
||||
|
||||
.. envvar:: PCO_SKIP_PASSES
|
||||
|
||||
A comma-separated list of passes to skip.
|
||||
|
|
|
|||
|
|
@ -294,6 +294,7 @@ ForEachMacros: [
|
|||
'pco_foreach_instr_src_ssa_from',
|
||||
'pco_foreach_instr_src_vreg',
|
||||
'pco_foreach_instr_src_vreg_ssa',
|
||||
'pco_foreach_instr_src_vtxin_reg',
|
||||
'pco_foreach_loop_in_func',
|
||||
'pco_foreach_loop_in_func_from',
|
||||
'pco_foreach_loop_in_func_from_rev',
|
||||
|
|
|
|||
|
|
@ -478,6 +478,12 @@ rogue_max_wg_temps(const struct pvr_device_info *dev_info,
|
|||
return temps;
|
||||
}
|
||||
|
||||
/**
 * \brief Returns the number of vertex input registers that may be made
 * available for register allocation.
 *
 * The value is currently a fixed constant (12); see the TODO in the body.
 *
 * \return Number of vertex input registers.
 */
static inline uint32_t rogue_get_vtxins(void)
|
||||
{
|
||||
/* TODO: use highest safe number of vertex input registers. */
|
||||
return 12;
|
||||
}
|
||||
|
||||
static inline uint32_t
|
||||
rogue_num_uscs_per_tile(const struct pvr_device_info *dev_info)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -370,6 +370,7 @@ struct pvr_stats pco_get_pvr_stats(pco_shader *shader)
|
|||
.scratch_size = shader->data.common.scratch,
|
||||
.spill_count = shader->data.common.spilled_temps,
|
||||
.temp_count = shader->data.common.temps,
|
||||
.vtxin_count = shader->data.common.vtxins,
|
||||
.loop_count = loop_count,
|
||||
.inst_group_count = igrp_count,
|
||||
.main_inst_group_count = main_count,
|
||||
|
|
|
|||
|
|
@ -26,6 +26,9 @@ static const struct debug_named_value pco_debug_options[] = {
|
|||
{ "val_skip", PCO_DEBUG_VAL_SKIP, "Skip IR validation." },
|
||||
{ "reindex", PCO_DEBUG_REINDEX, "Reindex IR at the end of each pass." },
|
||||
{ "no_pred_cf", PCO_DEBUG_NO_PRED_CF, "No predicated execution in CF." },
|
||||
{ "alloc_extra_vtxins",
|
||||
PCO_DEBUG_ALLOC_EXTRA_VTXINS,
|
||||
"Allocates additional vertex input registers." },
|
||||
DEBUG_NAMED_VALUE_END,
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -57,6 +57,7 @@ enum pco_debug {
|
|||
PCO_DEBUG_VAL_SKIP = BITFIELD64_BIT(0),
|
||||
PCO_DEBUG_REINDEX = BITFIELD64_BIT(1),
|
||||
PCO_DEBUG_NO_PRED_CF = BITFIELD64_BIT(2),
|
||||
PCO_DEBUG_ALLOC_EXTRA_VTXINS = BITFIELD64_BIT(3),
|
||||
};
|
||||
|
||||
extern uint64_t pco_debug;
|
||||
|
|
@ -353,6 +354,7 @@ typedef struct _pco_func {
|
|||
unsigned next_loop; /** Next loop index. */
|
||||
|
||||
unsigned temps; /** Number of temps allocated. */
|
||||
unsigned vtxins; /** Number of vertex input registers used. */
|
||||
|
||||
pco_ref emc; /** Execution mask counter register. */
|
||||
|
||||
|
|
@ -731,6 +733,10 @@ PCO_DEFINE_CAST(pco_cf_node_as_func,
|
|||
pco_foreach_instr_src (psrc, instr) \
|
||||
if (pco_ref_is_hwreg(*psrc))
|
||||
|
||||
/* Iterates over the sources of instr, running the loop body only for those
 * sources for which pco_ref_is_vtxin() is true; psrc points at each source.
 */
#define pco_foreach_instr_src_vtxin_reg(psrc, instr) \
|
||||
pco_foreach_instr_src (psrc, instr) \
|
||||
if (pco_ref_is_vtxin(*psrc))
|
||||
|
||||
#define pco_cf_node_head(list) list_first_entry(list, pco_cf_node, link)
|
||||
#define pco_cf_node_tail(list) list_last_entry(list, pco_cf_node, link)
|
||||
|
||||
|
|
@ -1985,6 +1991,17 @@ static inline bool pco_ref_is_scalar(pco_ref ref)
|
|||
return !ref.chans;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Return whether a reference is a vertex input register.
|
||||
*
|
||||
* \param[in] ref PCO reference.
|
||||
* \return True if the reference is a vertex input register.
|
||||
*/
|
||||
static inline bool pco_ref_is_vtxin(pco_ref ref)
|
||||
{
|
||||
/* Both conditions are required: the reference must be a register reference
 * and that register must be in the vertex input class.
 */
return ref.type == PCO_REF_TYPE_REG && ref.reg_class == PCO_REG_CLASS_VTXIN;
|
||||
}
|
||||
|
||||
/* PCO ref getters. */
|
||||
/**
|
||||
* \brief Returns the pointee component of an indexed register reference.
|
||||
|
|
|
|||
|
|
@ -1187,8 +1187,10 @@ static void pco_print_func(pco_print_state *state, pco_func *func)
|
|||
{
|
||||
pco_printfi(state, "func");
|
||||
pco_print_func_sig(state, func, false);
|
||||
if (state->is_grouped)
|
||||
if (state->is_grouped) {
|
||||
pco_printf(state, " /* temps: %u */", func->temps);
|
||||
pco_printf(state, " /* vtxins: %u */", func->vtxins);
|
||||
}
|
||||
pco_printf(state, "\n");
|
||||
|
||||
pco_printfi(state, "{\n");
|
||||
|
|
|
|||
|
|
@ -461,9 +461,10 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx)
|
|||
|
||||
preproc_vecs(func);
|
||||
|
||||
unsigned num_rsvd_vtxins = func->parent_shader->data.common.vtxins;
|
||||
unsigned num_ssas = func->next_ssa;
|
||||
unsigned num_vregs = func->next_vreg;
|
||||
unsigned num_vars = num_ssas + num_vregs;
|
||||
unsigned num_vars = num_ssas + num_vregs + num_rsvd_vtxins;
|
||||
|
||||
/* Collect used bit sizes. */
|
||||
uint8_t used_bits = 0;
|
||||
|
|
@ -492,7 +493,9 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx)
|
|||
assert(only_32bit);
|
||||
|
||||
struct ra_regs *ra_regs =
|
||||
ra_alloc_reg_set(func, ctx->allocable_temps, !only_32bit);
|
||||
ra_alloc_reg_set(func,
|
||||
ctx->allocable_temps + ctx->allocable_vtxins,
|
||||
!only_32bit);
|
||||
|
||||
BITSET_WORD *comps =
|
||||
rzalloc_array_size(ra_regs, sizeof(*comps), BITSET_WORDS(num_ssas));
|
||||
|
|
@ -609,6 +612,11 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx)
|
|||
|
||||
for (unsigned t = 0; t < ctx->allocable_temps - (stride - 1); ++t)
|
||||
ra_class_add_reg(ra_class, t);
|
||||
|
||||
for (unsigned t = ctx->allocable_temps;
|
||||
t < ctx->allocable_temps + ctx->allocable_vtxins - (stride - 1);
|
||||
++t)
|
||||
ra_class_add_reg(ra_class, t);
|
||||
}
|
||||
|
||||
ra_set_finalize(ra_regs, NULL);
|
||||
|
|
@ -686,6 +694,26 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx)
|
|||
live_ranges[src.val].end =
|
||||
MAX2(live_ranges[src.val].end, instr->index);
|
||||
}
|
||||
|
||||
/* Ensure that vertex input registers with pre-initialised data are not
|
||||
* clobbered too early.
|
||||
*/
|
||||
if (ctx->allocable_vtxins > 0) {
|
||||
pco_foreach_instr_src_vtxin_reg (psrc, instr) {
|
||||
pco_ref src = *psrc;
|
||||
|
||||
/* Place vtxin regs after ssa vars and vregs. */
|
||||
src.val += num_ssas + num_vregs;
|
||||
|
||||
live_ranges[src.val].end =
|
||||
MAX2(live_ranges[src.val].end, instr->index);
|
||||
live_ranges[src.val].start = 0;
|
||||
|
||||
ra_set_node_reg(ra_graph,
|
||||
src.val,
|
||||
psrc->val + ctx->allocable_temps);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Extend lifetimes of non-overridden vecs that have comp instructions. */
|
||||
|
|
@ -898,16 +926,31 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx)
|
|||
}
|
||||
|
||||
for (unsigned u = 0; u < chans; ++u) {
|
||||
pco_ref dest =
|
||||
pco_ref_hwreg(temp_dest_base + offset, PCO_REG_CLASS_TEMP);
|
||||
pco_ref dest;
|
||||
if (temp_dest_base + offset >= ctx->allocable_temps) {
|
||||
dest = pco_ref_hwreg(temp_dest_base + offset -
|
||||
ctx->allocable_temps,
|
||||
PCO_REG_CLASS_VTXIN);
|
||||
} else {
|
||||
dest = pco_ref_hwreg(temp_dest_base + offset,
|
||||
PCO_REG_CLASS_TEMP);
|
||||
}
|
||||
|
||||
dest = pco_ref_offset(dest, u);
|
||||
dest = pco_ref_offset(dest, ctx->temp_alloc_offset);
|
||||
|
||||
pco_ref src;
|
||||
if (pco_ref_is_ssa(*psrc) || pco_ref_is_vreg(*psrc))
|
||||
src = pco_ref_hwreg(temp_src_base, PCO_REG_CLASS_TEMP);
|
||||
else
|
||||
if (pco_ref_is_ssa(*psrc) || pco_ref_is_vreg(*psrc)) {
|
||||
if (temp_src_base >= ctx->allocable_temps) {
|
||||
src =
|
||||
pco_ref_hwreg(temp_src_base - ctx->allocable_temps,
|
||||
PCO_REG_CLASS_VTXIN);
|
||||
} else {
|
||||
src = pco_ref_hwreg(temp_src_base, PCO_REG_CLASS_TEMP);
|
||||
}
|
||||
} else {
|
||||
src = pco_ref_chans(*psrc, 1);
|
||||
}
|
||||
|
||||
src = pco_ref_offset(src, u);
|
||||
src = pco_ref_offset(src, ctx->temp_alloc_offset);
|
||||
|
|
@ -981,7 +1024,15 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx)
|
|||
pdest->type = PCO_REF_TYPE_REG;
|
||||
pdest->reg_class = PCO_REG_CLASS_TEMP;
|
||||
pdest->val = val + ctx->temp_alloc_offset;
|
||||
temps = MAX2(temps, dest_temps + ctx->temp_alloc_offset);
|
||||
|
||||
/* Got a vertex input register. */
|
||||
if (val >= ctx->allocable_temps) {
|
||||
pdest->reg_class = PCO_REG_CLASS_VTXIN;
|
||||
pdest->val = val - ctx->allocable_temps;
|
||||
vtxins = MAX2(vtxins, dest_temps - ctx->allocable_temps);
|
||||
} else {
|
||||
temps = MAX2(temps, dest_temps + ctx->temp_alloc_offset);
|
||||
}
|
||||
}
|
||||
|
||||
pco_foreach_instr_src_ssa (psrc, instr) {
|
||||
|
|
@ -996,6 +1047,12 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx)
|
|||
psrc->type = PCO_REF_TYPE_REG;
|
||||
psrc->reg_class = PCO_REG_CLASS_TEMP;
|
||||
psrc->val = val + ctx->temp_alloc_offset;
|
||||
|
||||
/* Got a vertex input register. */
|
||||
if (val >= ctx->allocable_temps) {
|
||||
psrc->reg_class = PCO_REG_CLASS_VTXIN;
|
||||
psrc->val = val - ctx->allocable_temps;
|
||||
}
|
||||
}
|
||||
|
||||
pco_foreach_instr_dest_vreg (pdest, instr) {
|
||||
|
|
@ -1005,7 +1062,15 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx)
|
|||
pdest->type = PCO_REF_TYPE_REG;
|
||||
pdest->reg_class = PCO_REG_CLASS_TEMP;
|
||||
pdest->val = val + ctx->temp_alloc_offset;
|
||||
temps = MAX2(temps, dest_temps);
|
||||
|
||||
/* Got a vertex input register. */
|
||||
if (val >= ctx->allocable_temps) {
|
||||
pdest->reg_class = PCO_REG_CLASS_VTXIN;
|
||||
pdest->val = val - ctx->allocable_temps;
|
||||
vtxins = MAX2(vtxins, dest_temps - ctx->allocable_temps);
|
||||
} else {
|
||||
temps = MAX2(temps, dest_temps);
|
||||
}
|
||||
}
|
||||
|
||||
pco_foreach_instr_src_vreg (psrc, instr) {
|
||||
|
|
@ -1014,6 +1079,12 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx)
|
|||
psrc->type = PCO_REF_TYPE_REG;
|
||||
psrc->reg_class = PCO_REG_CLASS_TEMP;
|
||||
psrc->val = val + ctx->temp_alloc_offset;
|
||||
|
||||
/* Got a vertex input register. */
|
||||
if (val >= ctx->allocable_temps) {
|
||||
psrc->reg_class = PCO_REG_CLASS_VTXIN;
|
||||
psrc->val = val - ctx->allocable_temps;
|
||||
}
|
||||
}
|
||||
|
||||
/* Drop no-ops. */
|
||||
|
|
@ -1027,6 +1098,7 @@ static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx)
|
|||
ralloc_free(ra_regs);
|
||||
|
||||
func->temps = temps;
|
||||
func->vtxins = vtxins;
|
||||
|
||||
if (pco_should_print_shader(func->parent_shader) && PCO_DEBUG_PRINT(RA)) {
|
||||
printf(
|
||||
|
|
@ -1060,12 +1132,27 @@ bool pco_ra(pco_shader *shader)
|
|||
* unsigned opt_temps = rogue_get_optimal_temps(shader->ctx->dev_info);
|
||||
*/
|
||||
|
||||
/* If any vertex input registers are already used, round up to the nearest
|
||||
* multiple of 4 as vertex input registers are allocated in blocks of 4.
|
||||
*
|
||||
* This number is used by default as the maximum safe number of vertex input
|
||||
* registers that can be used is not currently known.
|
||||
*/
|
||||
unsigned hw_vtxins = ALIGN_POT(shader->data.common.vtxins,
|
||||
ROGUE_USRM_GRANULARITY_IN_REGISTERS);
|
||||
|
||||
if (shader->stage != MESA_SHADER_FRAGMENT && !shader->is_internal) {
|
||||
if (PCO_DEBUG(ALLOC_EXTRA_VTXINS)) {
|
||||
hw_vtxins = MAX2(hw_vtxins, rogue_get_vtxins());
|
||||
}
|
||||
}
|
||||
|
||||
/* TODO: different number of temps available if preamble/phase change. */
|
||||
/* TODO: different number of temps available if barriers are in use. */
|
||||
/* TODO: support for internal and vtxin registers. */
|
||||
/* TODO: support for internal registers. */
|
||||
pco_ra_ctx ctx = {
|
||||
.allocable_temps = hw_temps,
|
||||
.allocable_vtxins = 0,
|
||||
.allocable_vtxins = hw_vtxins,
|
||||
.allocable_interns = 0,
|
||||
};
|
||||
|
||||
|
|
@ -1088,6 +1175,8 @@ bool pco_ra(pco_shader *shader)
|
|||
progress |= pco_ra_func(func, &ctx);
|
||||
|
||||
shader->data.common.temps = MAX2(shader->data.common.temps, func->temps);
|
||||
shader->data.common.vtxins =
|
||||
MAX2(shader->data.common.vtxins, func->vtxins);
|
||||
}
|
||||
|
||||
shader->data.common.spilled_temps = ctx.spilled_temps;
|
||||
|
|
|
|||
|
|
@ -200,6 +200,7 @@
|
|||
<stat name="Scratch size">Scratch size per instance in bytes</stat>
|
||||
<stat name="Spill count">Number of spilled registers per instance</stat>
|
||||
<stat name="Temp count">Number of allocated temp registers</stat>
|
||||
<stat name="Vtxin count">Number of used vertex input registers</stat>
|
||||
<stat name="Loop count">Number of not unrolled loops in the shader</stat>
|
||||
<stat name="Inst group count">Total number of instruction groups</stat>
|
||||
<stat name="Main inst group count">Number of main instruction groups</stat>
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue