ir3: optimize alias register allocation by reusing GPRs
Allocate alias registers for an alias group while trying to minimize the
number of needed aliases. That is, if the allocated GPRs for the group
are (partially) consecutive, only allocate aliases to fill in the gaps.
For example:

   sam ..., @{r1.x, r5.z, r1.z}, ...

only needs a single alias:

   alias.tex.b32.0 r1.y, r5.z
   sam ..., r1.x, ...

Also, try to reuse allocations of previous groups. For example, this is
relatively common:

   sam ..., @{r2.z, 0}, @{0}

Reusing the allocation of the first group for the second one gives this:

   alias.tex.b32.0 r2.w, 0
   sam ..., r2.z, r2.w
Signed-off-by: Job Noorman <jnoorman@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31222>
parent 3fb0f54d70
commit 9b6bca52d5

1 changed file with 335 additions and 6 deletions
@@ -177,9 +177,17 @@ struct alias_table_entry {
    struct ir3_register *src;
 };
 
+typedef BITSET_DECLARE(reg_bitset, GPR_REG_SIZE);
+
 struct alias_table_state {
    struct alias_table_entry entries[16];
    unsigned num_entries;
+
+   /* The registers currently allocated for the instruction. Note that this
+    * includes both alias registers and GPRs that are reused.
+    */
+   reg_bitset full_alloc;
+   reg_bitset half_alloc;
 };
 
 static void
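The two new bitset fields rely on Mesa's util/bitset.h range macros, which take inclusive bounds. Below, BITSET_TEST_RANGE is used in negated form: a whole range being clear means the candidate registers are free. A minimal standalone sketch of the semantics assumed here (toy types and helpers, not the actual Mesa implementation):

   #include <assert.h>
   #include <stdbool.h>
   #include <stdint.h>

   /* Toy stand-in for BITSET_DECLARE(bs, 128): one bit per register slot. */
   typedef uint64_t toy_bitset[2];

   /* Like BITSET_TEST_RANGE: true if any bit in [start, end] (inclusive) is set. */
   static bool
   toy_test_range(const toy_bitset bs, unsigned start, unsigned end)
   {
      for (unsigned i = start; i <= end; i++) {
         if (bs[i / 64] & (UINT64_C(1) << (i % 64)))
            return true;
      }
      return false;
   }

   /* Like BITSET_SET_RANGE: set all bits in [start, end] (inclusive). */
   static void
   toy_set_range(toy_bitset bs, unsigned start, unsigned end)
   {
      for (unsigned i = start; i <= end; i++)
         bs[i / 64] |= UINT64_C(1) << (i % 64);
   }

   int
   main(void)
   {
      toy_bitset alloc = {0};
      toy_set_range(alloc, 4, 6);           /* mark slots 4..6 allocated */
      assert(toy_test_range(alloc, 5, 8));  /* overlaps slots 5 and 6 */
      assert(!toy_test_range(alloc, 7, 9)); /* slots 7..9 still free */
      return 0;
   }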
@@ -192,19 +200,340 @@ add_table_entry(struct alias_table_state *state, unsigned alias_reg,
    entry->src = src;
 }
 
+static void
+clear_table(struct alias_table_state *state)
+{
+   BITSET_ZERO(state->full_alloc);
+   BITSET_ZERO(state->half_alloc);
+   state->num_entries = 0;
+}
+
+static unsigned
+lookup_alias(struct alias_table_state *state, struct ir3_register *alias)
+{
+   for (unsigned i = 0; i < state->num_entries; i++) {
+      struct alias_table_entry *entry = &state->entries[i];
+      unsigned match_flags = (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_HALF);
+
+      if ((alias->flags & match_flags) != (entry->src->flags & match_flags)) {
+         continue;
+      }
+
+      if (alias->flags & IR3_REG_IMMED) {
+         if (alias->uim_val == entry->src->uim_val) {
+            return entry->alias_reg;
+         }
+      } else if (alias->num == entry->src->num) {
+         return entry->alias_reg;
+      }
+   }
+
+   return INVALID_REG;
+}
+
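Note the matching rule: sources only match when their CONST/IMMED/HALF flags agree, after which immediates compare by uim_val and everything else by register number. This is what lets an immediate 0 used by two different alias groups share a single table entry, as in the @{r2.z, 0}, @{0} example from the commit message.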
+/* Find existing entries in the alias table for all aliases in this alias group.
+ * If all aliases are already in the table, and they are in consecutive
+ * registers, we can simply reuse these registers without creating new table
+ * entries.
+ * TODO if there's a partial overlap between the start of the alias group and
+ * the end of an existing allocation range, we might be able to partially reuse
+ * table entries.
+ */
+static unsigned
+find_existing_alloc(struct alias_table_state *state,
+                    struct ir3_instruction *instr, unsigned first_src_n)
+{
+   if (state->num_entries == 0) {
+      return INVALID_REG;
+   }
+
+   unsigned first_reg = INVALID_REG;
+
+   foreach_src_in_alias_group_n (alias, alias_n, instr, first_src_n) {
+      unsigned reg = lookup_alias(state, alias);
+
+      if (reg == INVALID_REG) {
+         return INVALID_REG;
+      }
+
+      if (alias_n == 0) {
+         first_reg = reg;
+      } else if (reg != first_reg + alias_n) {
+         return INVALID_REG;
+      }
+   }
+
+   assert(first_reg != INVALID_REG);
+   return first_reg;
+}
+
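A concrete trace for the commit-message example: after the first group @{r2.z, 0} allocates r2.w for the immediate 0, the second group @{0} looks up 0 and finds r2.w; a single-element group is trivially consecutive, so it reuses r2.w without adding a new table entry.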
+static unsigned
+find_free_alias_regs_in_range(const reg_bitset *alloc_regs,
+                              unsigned num_aliases, unsigned start,
+                              unsigned end)
+{
+   assert(end >= num_aliases);
+
+   for (unsigned reg = start; reg < end - num_aliases; reg++) {
+      if (!BITSET_TEST_RANGE(*alloc_regs, reg, reg + num_aliases - 1)) {
+         return reg;
+      }
+   }
+
+   return INVALID_REG;
+}
+
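The scan above is a first-fit search for a run of free slots. A standalone sketch of the same idea with a plain bool array (toy names, sentinel, and bounds are illustrative only):

   #include <stdbool.h>
   #include <stdio.h>

   #define TOY_INVALID (~0u)

   /* First-fit: lowest slot in [start, end) where len consecutive slots are free. */
   static unsigned
   toy_find_free_run(const bool *alloc, unsigned len, unsigned start, unsigned end)
   {
      for (unsigned reg = start; reg + len <= end; reg++) {
         bool free = true;
         for (unsigned i = 0; i < len; i++)
            free &= !alloc[reg + i];
         if (free)
            return reg;
      }
      return TOY_INVALID;
   }

   int
   main(void)
   {
      /* Slots 0..15; 3..5 already taken. */
      bool alloc[16] = {0};
      alloc[3] = alloc[4] = alloc[5] = true;

      /* A run of 4 can't start at 0..2 (it would overlap slot 3..5),
       * so the first fit is slot 6. */
      printf("%u\n", toy_find_free_run(alloc, 4, 0, 16)); /* prints 6 */
      return 0;
   }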
+static unsigned
+find_free_alias_regs(const reg_bitset *alloc_regs, unsigned num_aliases)
+{
+   unsigned reg = find_free_alias_regs_in_range(alloc_regs, num_aliases,
+                                                FIRST_ALIAS_REG, GPR_REG_SIZE);
+
+   if (reg != INVALID_REG) {
+      return reg;
+   }
+
+   return find_free_alias_regs_in_range(alloc_regs, num_aliases, 0,
+                                        FIRST_ALIAS_REG);
+}
+
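This wrapper tries the dedicated alias range [FIRST_ALIAS_REG, GPR_REG_SIZE) first and only falls back to [0, FIRST_ALIAS_REG); presumably this keeps low register numbers, which real sources tend to occupy, available for reuse by later groups.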
+struct reg_alloc_info {
+   unsigned first_src_n;
+   unsigned reg;
+   unsigned num_reused;
+};
+
+/* Allocate alias registers for an alias group while trying to minimize the
+ * number of needed aliases. That is, if the allocated GPRs for the group are
+ * (partially) consecutive, only allocate aliases to fill in the gaps. For
+ * example:
+ * sam ..., @{r1.x, r5.z, r1.z}, ...
+ * only needs a single alias:
+ * alias.tex.b32.0 r1.y, r5.z
+ * sam ..., r1.x, ...
+ */
+static struct reg_alloc_info
+alloc_alias(struct alias_table_state *state, struct ir3_instruction *instr,
+            unsigned first_src_n)
+{
+   assert(first_src_n < instr->srcs_count);
+
+   struct ir3_register *src0 = instr->srcs[first_src_n];
+   assert(src0->flags & IR3_REG_FIRST_ALIAS);
+
+   unsigned num_aliases = 0;
+
+   foreach_src_in_alias_group (alias, instr, first_src_n) {
+      num_aliases++;
+   }
+
+   assert(num_aliases > 0);
+
+   reg_bitset *alloc_regs =
+      (src0->flags & IR3_REG_HALF) ? &state->half_alloc : &state->full_alloc;
+
+   /* All the GPRs used by this alias group that aren't already allocated by
+    * previous alias groups.
+    */
+   unsigned used_regs[num_aliases];
+
+   foreach_src_in_alias_group_n (alias, alias_n, instr, first_src_n) {
+      if (is_reg_gpr(alias) && !BITSET_TEST(*alloc_regs, alias->num)) {
+         used_regs[alias_n] = alias->num;
+      } else {
+         used_regs[alias_n] = INVALID_REG;
+      }
+   }
+
+   /* Find the register that, when allocated to the first src in the alias
+    * group, will maximize the number of GPRs reused (i.e., that don't need an
+    * alias) in the group.
+    */
+   unsigned best_reg = INVALID_REG;
+   unsigned best_num_reused = 0;
+
+   foreach_src_in_alias_group_n (alias, alias_n, instr, first_src_n) {
+      if (used_regs[alias_n] == INVALID_REG) {
+         /* No (free) GPR is used by this alias. */
+         continue;
+      }
+
+      if (alias->num < alias_n) {
+         /* To be able to fit the current alias reg in a valid consecutive
+          * range, its GPR number needs to be at least its index in the alias
+          * group. Otherwise, there won't be enough GPR space left before it:
+          * sam ..., @{r5.w, r0.x, r0.y}, ...
+          * Even though r0.x and r0.y are consecutive, we won't be able to reuse
+          * them since there's no GPR before r0.x to alias to r5.w.
+          */
+         continue;
+      }
+
+      if (alias->num + num_aliases - alias_n >= GPR_REG_SIZE) {
+         /* Same reasoning as above but for the end of the GPR space. */
+         continue;
+      }
+
+      /* Check if it's possible to reuse the allocated GPR of the current alias
+       * reg. If we reuse it, all other aliases in this group will have their
+       * GPR number based on the current one and need to be free.
+       */
+      unsigned first_reg = alias->num - alias_n;
+
+      if (BITSET_TEST_RANGE(*alloc_regs, first_reg,
+                            first_reg + num_aliases - 1)) {
+         continue;
+      }
+
+      /* Check how many GPRs will be reused with this choice. Note that we don't
+       * have to check previous registers in the alias group since if we can
+       * reuse those, the current alias would have been counted there as well.
+       */
+      unsigned num_reused = 1;
+
+      for (unsigned i = alias_n + 1; i < num_aliases; i++) {
+         if (used_regs[i] == first_reg + i) {
+            num_reused++;
+         }
+      }
+
+      if (num_reused > best_num_reused) {
+         best_num_reused = num_reused;
+         best_reg = alias->num - alias_n;
+      }
+   }
+
+   if (best_reg == INVALID_REG) {
+      /* No reuse possible, just allocate fresh registers. */
+      best_reg = find_free_alias_regs(alloc_regs, num_aliases);
+
+      /* We can use the full GPR space (4 * 48 regs) to allocate aliases which
+       * is enough to always find a free range that is large enough. The maximum
+       * number of aliases is 12 (src0) + 4 (src1) + 2 (samp_tex) so the worst
+       * case reuse looks something like this (note that the number of aliases
+       * is limited to 16 so in practice, it will never be this bad):
+       *        [ ... src1.x..src1.w ... samp_tex.x samp_tex.y ... ]
+       * #GPR   0     11      14         26         27
+       * Here, src1 and samp_tex reuse GPRs in such a way that they leave a gap
+       * of 11 GPRs around them so that the src0 will not fit. There is ample
+       * GPR space left for src0 even in this scenario.
+       */
+      assert(best_reg != INVALID_REG);
+   }
+
+   /* Mark used registers as allocated. */
+   unsigned end_reg = best_reg + num_aliases - 1;
+   assert(end_reg < GPR_REG_SIZE);
+   assert(!BITSET_TEST_RANGE(*alloc_regs, best_reg, end_reg));
+   BITSET_SET_RANGE(*alloc_regs, best_reg, end_reg);
+
+   /* Add the allocated registers that differ from the ones already used to the
+    * alias table.
+    */
+   for (unsigned i = 0; i < num_aliases; i++) {
+      unsigned reg = best_reg + i;
+
+      if (used_regs[i] != reg) {
+         struct ir3_register *src = instr->srcs[first_src_n + i];
+         add_table_entry(state, reg, src);
+      }
+   }
+
+   return (struct reg_alloc_info){
+      .first_src_n = first_src_n,
+      .reg = best_reg,
+      .num_reused = best_num_reused,
+   };
+}
+
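The base-selection heuristic can be sketched in isolation: for each group member that already sits in an otherwise-unallocated GPR, try basing the group at member_reg - member_index and count the members that land on their existing registers. A self-contained toy version (flat register numbers; the occupancy checks against other groups are omitted here):

   #include <stdio.h>

   #define TOY_INVALID (~0u)

   /* regs[i]: GPR already holding group member i, or TOY_INVALID if none.
    * Returns the base register maximizing reuse, or TOY_INVALID. */
   static unsigned
   toy_best_base(const unsigned *regs, unsigned n, unsigned gpr_count,
                 unsigned *out_reused)
   {
      unsigned best = TOY_INVALID, best_reused = 0;

      for (unsigned i = 0; i < n; i++) {
         if (regs[i] == TOY_INVALID || regs[i] < i ||
             regs[i] - i + n > gpr_count)
            continue; /* no GPR, or the group would fall off either end */

         unsigned base = regs[i] - i, reused = 0;
         for (unsigned j = 0; j < n; j++)
            reused += (regs[j] == base + j);

         if (reused > best_reused) {
            best_reused = reused;
            best = base;
         }
      }

      if (out_reused)
         *out_reused = best_reused;
      return best;
   }

   int
   main(void)
   {
      /* @{r1.x, r5.z, r1.z} flattened (4 components per reg): 4, 22, 6. */
      unsigned regs[3] = {4, 22, 6};
      unsigned reused;
      unsigned base = toy_best_base(regs, 3, 192, &reused);
      printf("base=%u reused=%u\n", base, reused); /* base=4 (r1.x), reused=2 */
      return 0;
   }

With base r1.x, members r1.x and r1.z are reused and only r5.z needs an alias at r1.y, matching the commit-message example.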
+static int
+cmp_alloc(const void *ptr1, const void *ptr2)
+{
+   const struct reg_alloc_info *alloc1 = ptr1;
+   const struct reg_alloc_info *alloc2 = ptr2;
+   return alloc2->num_reused - alloc1->num_reused;
+}
+
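cmp_alloc returns a positive value when alloc2 reused more registers than alloc1, so qsort orders the groups by decreasing num_reused, which is exactly the phase-2 order described below.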
 static void
 alloc_aliases(struct alias_table_state *state, struct ir3_instruction *instr,
               unsigned *regs)
 {
-   unsigned next_alias_reg = FIRST_ALIAS_REG;
-
-   foreach_src_n (src, src_n, instr) {
-      if (src->flags & IR3_REG_ALIAS) {
-         unsigned alias_reg = next_alias_reg++;
-         regs[src_n] = alias_reg;
-         add_table_entry(state, alias_reg, instr->srcs[src_n]);
+   unsigned num_alias_groups = 0;
+
+   foreach_src (src, instr) {
+      if (src->flags & IR3_REG_FIRST_ALIAS) {
+         num_alias_groups++;
       }
    }
+
+   assert(num_alias_groups > 0);
+   struct reg_alloc_info allocs[num_alias_groups];
+   unsigned alloc_i = 0;
+
+   /* We allocate alias registers in two phases:
+    * 1. Allocate each alias group as if it were the only group. This way, the
+    * number of registers it can reuse is maximized (because it will never
+    * conflict with other groups). We keep track of the number of reused
+    * registers per group.
+    */
+   foreach_src_n (src, src_n, instr) {
+      if (src->flags & IR3_REG_FIRST_ALIAS) {
+         allocs[alloc_i++] = alloc_alias(state, instr, src_n);
+         clear_table(state);
+      }
+   }
+
+   /* 2. Do the actual allocation of the groups ordered by decreasing number of
+    * reused registers. This results in a greater (though not necessarily
+    * optimal) total number of reused registers and, thus, a smaller number of
+    * table entries. This helps in situations like this:
+    * sam ..., @{r0.z, r1.y}, @{r0.w, r1.x}
+    * The first group can reuse 1 register while the second can reuse 2. All
+    * valid choices to reuse one register in the first group (r0.z/r0.w or
+    * r1.x/r1.y) lead to an overlap with the second group which means that no
+    * reuse is possible in the second group:
+    * alias.tex.b32.2 r0.w, r1.y
+    * alias.tex.b32.0 r40.x, r0.w
+    * alias.tex.b32.0 r40.y, r1.x
+    * sam ..., r0.z, r40.x
+    * Allocating the second group first leads to an optimal allocation:
+    * alias.tex.b32.1 r40.x, r0.z
+    * alias.tex.b32.0 r40.y, r1.y
+    * sam ..., r40.x, r0.w
+    */
+   qsort(allocs, num_alias_groups, sizeof(allocs[0]), cmp_alloc);
+
+   /* Mark all GPR sources that cannot be aliased as allocated since we have to
+    * make sure no alias overlaps with them.
+    */
+   foreach_src (src, instr) {
+      if (can_alias_src(src) && !(src->flags & IR3_REG_ALIAS)) {
+         reg_bitset *alloc_regs = (src->flags & IR3_REG_HALF)
+                                     ? &state->half_alloc
+                                     : &state->full_alloc;
+         BITSET_SET(*alloc_regs, src->num);
+      }
+   }
+
+   for (unsigned i = 0; i < num_alias_groups; i++) {
+      struct reg_alloc_info *alloc = &allocs[i];
+
+      /* Check if any allocations made by previous groups can be reused for this
+       * one. For example, this is relatively common:
+       * sam ..., @{r2.z, 0}, @{0}
+       * Reusing the allocation of the first group for the second one gives
+       * this:
+       * alias.tex.b32.0 r2.w, 0
+       * sam ..., r2.z, r2.w
+       */
+      alloc->reg = find_existing_alloc(state, instr, alloc->first_src_n);
+
+      if (alloc->reg == INVALID_REG) {
+         *alloc = alloc_alias(state, instr, alloc->first_src_n);
+      }
+
+      regs[alloc->first_src_n] = alloc->reg;
+   }
+}
+
 static bool
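The overall driver is a score-then-commit greedy. A toy sketch of just that shape, with stand-in scores taken from the sam example above (toy_cmp mirrors cmp_alloc; the real allocation work is elided):

   #include <stdio.h>
   #include <stdlib.h>

   struct toy_alloc {
      unsigned item;
      unsigned score; /* analog of num_reused from phase 1 */
   };

   static int
   toy_cmp(const void *a, const void *b)
   {
      const struct toy_alloc *x = a, *y = b;
      return (int)y->score - (int)x->score; /* descending by score */
   }

   int
   main(void)
   {
      /* Phase 1: score each item in isolation (stand-in numbers). */
      struct toy_alloc allocs[] = {
         {.item = 0, .score = 1}, /* e.g. @{r0.z, r1.y}: 1 reusable GPR */
         {.item = 1, .score = 2}, /* e.g. @{r0.w, r1.x}: 2 reusable GPRs */
      };

      /* Phase 2: commit in order of decreasing score, so the group with the
       * most to lose from conflicts allocates first. */
      qsort(allocs, 2, sizeof(allocs[0]), toy_cmp);

      for (unsigned i = 0; i < 2; i++)
         printf("allocate item %u (score %u)\n", allocs[i].item, allocs[i].score);
      return 0;
   }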